```python
import operator

import numpy as np
import pandas as pd

# Requires Filter_word (defined below) and a module-level string `text`
# holding the document to analyse.
def TextRank():
    window = 3
    win_dict = {}
    filter_word = Filter_word(text)
    length = len(filter_word)
    # Build the co-occurrence window set for each node. Using enumerate
    # (rather than list.index) handles repeated words correctly, and the
    # windows of all occurrences of a word are merged.
    for index, word in enumerate(filter_word):
        # Clamp the left/right window boundaries to the document.
        left = max(index - window + 1, 0)
        right = min(index + window, length)
        words = set()
        for i in range(left, right):
            if i == index:
                continue
            words.add(filter_word[i])
        win_dict.setdefault(word, set()).update(words)
    # Build the adjacency matrix of the co-occurrence graph.
    word_dict = list(set(filter_word))
    lengths = len(word_dict)
    matrix = pd.DataFrame(np.zeros([lengths, lengths]))
    for word in win_dict:
        for value in win_dict[word]:
            index1 = word_dict.index(word)
            index2 = word_dict.index(value)
            matrix.iloc[index1, index2] = 1
            matrix.iloc[index2, index1] = 1
    # Normalize each column to sum to 1 (the original accumulated the sum
    # across all columns, which is a bug); skip all-zero columns.
    for j in range(matrix.shape[1]):
        summ = matrix[j].sum()
        if summ > 0:
            matrix[j] /= summ
    # Iterate the TextRank formula. The damping factor d must lie in (0, 1);
    # the customary value is 0.85 (the original's 9.85 was a typo).
    d = 0.85
    iter_num = 700
    word_textrank = {}
    textrank = np.ones([lengths, 1])
    for i in range(iter_num):
        textrank = (1 - d) + d * np.dot(matrix, textrank)
    # Map each word to its TextRank value.
    for i in range(len(textrank)):
        word_textrank[word_dict[i]] = textrank[i, 0]
    keyword = 6
    print('---------------------')
    print('TextRank model results:')
    for key, value in sorted(word_textrank.items(),
                             key=operator.itemgetter(1), reverse=True)[:keyword]:
        print(key + '/', end='')
```
This code implements a keyword-extraction model based on the TextRank algorithm. The core idea is to exploit the relationships between words: a graph is built over the candidate keywords, the importance of each word is computed over that graph, and the top-scoring words are returned as the document's keywords. The code first preprocesses the text, filtering out stopwords and other uninformative tokens; it then builds a co-occurrence window for each node and, from the words that share a window, creates the edges of the graph, stored as an adjacency matrix. That matrix is column-normalized, and each word's TextRank value is computed by iterating the TextRank formula. Finally, the words are sorted by TextRank value in descending order and the top few are output as the document's keywords.
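The iteration at the heart of the loop is the standard TextRank recurrence, TR(V_i) = (1 - d) + d · Σ_{V_j ∈ In(V_i)} TR(V_j) / |Out(V_j)|. As a self-contained sanity check, the same power iteration can be run on a hand-built adjacency matrix; the 4-node graph below is an invented example, not taken from the original text:

```python
import numpy as np

# Toy symmetric adjacency matrix for 4 nodes (assumed example).
adj = np.array([[0, 1, 1, 0],
                [1, 0, 1, 1],
                [1, 1, 0, 0],
                [0, 1, 0, 0]], dtype=float)
# Column-normalize so each column sums to 1, as the code above intends.
matrix = adj / adj.sum(axis=0, keepdims=True)

d = 0.85                  # damping factor
rank = np.ones((4, 1))    # initial score of 1 per node
for _ in range(100):
    rank = (1 - d) + d * matrix @ rank

print(rank.ravel())       # node 1, with the highest degree, scores highest
```

With the matrix column-normalized the scores converge after a few dozen iterations; practical implementations usually stop once the change between iterations drops below a small threshold rather than running a fixed 700 rounds.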
Related question
```python
import math
import operator

import jieba.posseg

# Read the stopword list, one word per line.
def Stop_words():
    stopword = []
    with open('C:/Users/Administrator/Desktop/data/stopword.txt',
              encoding='utf8') as f:
        for line in f:
            stopword.append(line.replace('\n', ''))  # strip the trailing newline
    return stopword

# POS-tag with jieba and filter the current document by POS and stopwords.
def Filter_word(text):
    filter_word = []
    stopword = Stop_words()
    for word, flag in jieba.posseg.cut(text):
        if not flag.startswith('n'):  # keep nouns only
            continue
        if word not in stopword and len(word) > 1:
            filter_word.append(word)
    return filter_word

# Filter the whole document collection by POS and stopwords.
def Filter_words(data_path='C:/Users/Administrator/Desktop/data/corpus.txt'):
    document = []
    stopword = Stop_words()  # load the stopword list once, not once per line
    for line in open(data_path, 'r', encoding='utf8'):
        segment = jieba.posseg.cut(line.strip())
        filter_words = []
        for word, flag in segment:
            if not flag.startswith('n'):
                continue
            if word not in stopword and len(word) > 1:
                filter_words.append(word)
        document.append(filter_words)
    return document

def tf_idf():
    tf_dict = {}
    idf_dict = {}
    filter_word = Filter_word(text)
    # Term frequency: occurrences divided by the number of filtered words
    # (the original divided by len(text), i.e. the character count).
    for word in filter_word:
        tf_dict[word] = tf_dict.get(word, 0) + 1
    for word in tf_dict:
        tf_dict[word] = tf_dict[word] / len(filter_word)
    # Document frequency over the corpus, smoothed by +1 in the denominator.
    document = Filter_words()
    doc_total = len(document)
    for doc in document:
        for word in set(doc):
            idf_dict[word] = idf_dict.get(word, 0) + 1
    for word in idf_dict:
        idf_dict[word] = math.log(doc_total / (idf_dict[word] + 1))
    # TF-IDF = TF * IDF; words unseen in the corpus get an IDF of 0.
    tf_idf_dict = {}
    for word in filter_word:
        if word not in idf_dict:
            idf_dict[word] = 0
        tf_idf_dict[word] = tf_dict[word] * idf_dict[word]
    return tf_idf_dict

tf_idf_dict = tf_idf()
keyword = 6
print('TF-IDF model results:')
for key, value in sorted(tf_idf_dict.items(),
                         key=operator.itemgetter(1), reverse=True)[:keyword]:
    print(key, end=' ')
print('\n')
```
This code implements a TF-IDF model for weighting the keywords in a text. Stop_words reads the stopword list, Filter_word filters a single document, and Filter_words filters the whole document collection. tf_idf computes the TF-IDF values, and the script finally prints the keyword highest-weighted terms (the top 6 here).
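As a quick sanity check on the formula tf_idf implements (tf = count / document length in filtered words, idf = log(N / (df + 1))), here is a toy computation; all numbers are invented for illustration:

```python
import math

# Assumed toy numbers, purely illustrative.
count_in_doc = 3    # the word appears 3 times in the current document
doc_len = 100       # the document contains 100 filtered words
doc_total = 50      # corpus size: 50 documents
doc_freq = 4        # the word occurs in 4 of them

tf = count_in_doc / doc_len                 # 0.03
idf = math.log(doc_total / (doc_freq + 1))  # log(10) ≈ 2.303
print(tf * idf)                             # ≈ 0.069
```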