请用 Python 帮我写一个完整的 RAKE 关键词提取算法,并给出运行结果
时间: 2023-05-17 08:04:33 浏览: 97
当然可以,以下是一个完整的Rake算法的Python实现:
```python
import re
import operator
from collections import defaultdict
def isPunct(word):
    """Return True when *word* is a single punctuation/symbol character.

    A token counts as punctuation when it is exactly one character long and
    that character is neither a letter nor a digit. Empty strings and
    multi-character tokens are never punctuation.
    """
    # The check must be negated: str.isalnum() is True for letters and
    # digits, which are exactly the characters that are NOT punctuation.
    return len(word) == 1 and not word.isalnum()
def isNumeric(word):
    """Return True when *word* parses as an integer or a dotted decimal.

    Tokens containing a '.' are parsed with float(); all other tokens are
    parsed with int(). Anything that fails to parse is not numeric.
    """
    # Only dotted tokens go through float(): float() on its own would also
    # accept ordinary words such as "nan" or "infinity".
    parse = float if '.' in word else int
    try:
        parse(word)
    except ValueError:
        return False
    return True
def loadStopWords(stopWordFile):
    """Collect the stop words listed in a stop-word source.

    Args:
        stopWordFile: An iterable of text lines, e.g. an open text file.
            Each line may hold any number of whitespace-separated words;
            blank lines contribute nothing.

    Returns:
        A list of stop words in file order, lower-cased.
    """
    # Lower-case on load: separateWords() lower-cases every token, so a
    # stop word stored as "The" could otherwise never match.
    return [word.lower() for line in stopWordFile for word in line.split()]
def separateWords(text, minWordReturnSize):
    """Split *text* into lower-cased word tokens.

    A token is a maximal run of word characters plus '+', '-' and '/'.
    Every other character acts as a separator.

    Args:
        text: The string to tokenize.
        minWordReturnSize: Tokens are kept only when they are strictly
            longer than this many characters (0 drops just the empty
            strings produced by adjacent separators).

    Returns:
        A list of tokens in order of appearance, excluding tokens rejected
        by isNumeric() or isPunct().
    """
    # \w is Unicode-aware for str patterns, so accented and CJK characters
    # stay inside tokens. The former ASCII-only class [a-zA-Z0-9_] treated
    # every non-ASCII character as a separator and discarded such text.
    splitter = re.compile(r'[^\w+\-/]')
    words = []
    for piece in splitter.split(text):
        token = piece.strip().lower()
        if len(token) <= minWordReturnSize:
            continue
        if isNumeric(token) or isPunct(token):
            continue
        words.append(token)
    return words
def calculateWordScores(wordFrequencies, degree):
    """Score each word by the RAKE ratio degree(word) / frequency(word).

    Args:
        wordFrequencies: Mapping of word -> number of occurrences.
        degree: Mapping of word -> co-occurrence degree. Words missing
            from this mapping are treated as having degree 0.

    Returns:
        A defaultdict(int) mapping every key of *wordFrequencies* to its
        score; looking up any other word yields 0.
    """
    wordScore = defaultdict(int)
    for word, frequency in wordFrequencies.items():
        if not frequency:
            # A zero frequency would divide by zero; such a word carries
            # no evidence, so it scores 0.
            wordScore[word] = 0
            continue
        # .get() instead of indexing: *degree* may be a defaultdict, and
        # indexing it would silently insert the missing key.
        wordScore[word] = degree.get(word, 0) / frequency
    return wordScore
def generateCandidateKeywords(sentenceList, stopWords):
    """Extract candidate phrases: maximal runs of non-stop words.

    Each sentence is tokenized with separateWords(sentence, 0). Stop words
    act as delimiters, and every contiguous run of remaining tokens is
    joined with single spaces into one candidate phrase.

    Args:
        sentenceList: Iterable of sentence strings.
        stopWords: Iterable of stop words; matched case-insensitively.

    Returns:
        A list of candidate phrases in order of appearance. Duplicates
        are kept.
    """
    # Build the lookup set once so each membership test is O(1).
    stopWordSet = {word.lower() for word in stopWords}
    phraseList = []
    for sentence in sentenceList:
        currentRun = []
        # Minimum size 0 keeps short tokens: a higher threshold would drop
        # short stop words before they could delimit phrases.
        for word in separateWords(sentence, 0):
            if word not in stopWordSet:
                currentRun.append(word)
            elif currentRun:
                phraseList.append(' '.join(currentRun))
                currentRun = []
        if currentRun:
            # Flush the run that reaches the end of the sentence.
            phraseList.append(' '.join(currentRun))
    return phraseList
def calculateCandidateScores(phraseList, wordScore):
    """Score each candidate phrase as the sum of its word scores.

    Args:
        phraseList: Iterable of phrases whose words are separated by
            whitespace. Repeated phrases map to a single entry.
        wordScore: Mapping of word -> score. Words missing from the
            mapping contribute 0.

    Returns:
        A defaultdict(int) mapping phrase -> total score.
    """
    candidateScore = defaultdict(int)
    for phrase in phraseList:
        # .get() keeps a defaultdict argument unmodified and lets a plain
        # dict be passed without raising KeyError.
        candidateScore[phrase] = sum(
            wordScore.get(word, 0) for word in phrase.split()
        )
    return candidateScore
def getKeywords(text, stopWordsPath='stopWords.txt', topN=10):
    """Return the highest-scoring candidate key phrases found in *text*.

    Args:
        text: The document to analyse.
        stopWordsPath: Path of the stop-word file, read with
            loadStopWords().
        topN: Maximum number of results to return; None returns all.

    Returns:
        A list of (phrase, score) tuples sorted by descending score.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    # The context manager closes the file even if loading fails.
    with open(stopWordsPath) as stopWordFile:
        stopWords = loadStopWords(stopWordFile)

    # Commas, semicolons and their full-width forms also end a phrase;
    # without them candidate phrases would run across punctuation.
    sentenceList = re.split('[!?.:,;。!?:;,、]', text)
    phraseList = generateCandidateKeywords(sentenceList, stopWords)

    # Both tables are keyed by individual word so the scorer can relate
    # them; a word's degree grows by the phrase length per occurrence.
    wordFrequencies = defaultdict(int)
    degree = defaultdict(int)
    for phrase in phraseList:
        words = phrase.split()
        for word in words:
            wordFrequencies[word] += 1
            degree[word] += len(words)

    wordScore = calculateWordScores(wordFrequencies, degree)
    candidateScore = calculateCandidateScores(phraseList, wordScore)
    sortedCandidates = sorted(
        candidateScore.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedCandidates[:topN]
# Demo: extract key phrases from a sample paragraph and print one per line.
# getKeywords() reads 'stopWords.txt' from the current working directory.
text = "Rake算法是一种关键词提取算法,它可以从文本中自动提取出最重要的关键词。Rake算法的原理是将文本分解成若干个短语,然后根据短语中单词的频率和相关性来计算每个短语的得分。得分高的短语就是文本中最重要的关键词。"
keywords = getKeywords(text)
for phrase, _score in keywords:
    print(phrase)
```
运行结果如下(仅为示意:代码不包含中文分词,并且需要当前目录下存在 stopWords.txt,实际输出会与此不同):
```
rake算法
关键词提取算法
文本
短语
单词
频率
相关性
计算
得分
高
```
阅读全文