with open('tf_idf.txt', 'w', encoding='utf-8') as f: for word, tf in tf_values.items(): idf = idf_values.get(word, 0) tf_idf = tf * idf f.write('{}\t{}\n'.format(word, tf_idf))将上述代码的结果改为降序输出
时间: 2023-08-10 21:09:40 浏览: 48
可以将代码中的`tf_idf`值存入一个列表中,然后使用`sorted`函数对列表进行降序排序,最后再将排序后的结果写入文件中。修改后的代码如下:
```
# Write (word, tf-idf) pairs to a file, sorted by score in descending order.
# Assumes tf_values and idf_values dicts are defined earlier in the script.
with open('tf_idf.txt', 'w', encoding='utf-8') as f:
    # Collect all pairs first so they can be sorted before writing.
    tf_idf_values = []
    for word, tf in tf_values.items():
        idf = idf_values.get(word, 0)
        tf_idf = tf * idf
        tf_idf_values.append((word, tf_idf))
    # In-place sort by the tf-idf score, highest first.
    tf_idf_values.sort(key=lambda x: x[1], reverse=True)
    for word, tf_idf in tf_idf_values:
        f.write('{}\t{}\n'.format(word, tf_idf))
```
这样就可以将结果按照tf-idf值降序输出到文件中。
相关问题
def Stop_words():
    """Read the stop-word file and return the stop words as a list of strings."""
    stopword = []
    data = []
    # `with` closes the file handle (the original leaked it).
    with open('C:/Users/Administrator/Desktop/data/stopword.txt', encoding='utf8') as f:
        for line in f.readlines():
            data.append(line)
    for i in data:
        output = str(i).replace('\n', '')  # strip the trailing newline
        stopword.append(output)
    return stopword


# POS-tag with jieba; filter the current document by POS and stop words
def Filter_word(text):
    """Keep only noun tokens longer than one character that are not stop words."""
    filter_word = []
    stopword = Stop_words()
    text = jieba.posseg.cut(text)
    for word, flag in text:
        if flag.startswith('n') is False:  # keep only noun ('n*') POS tags
            continue
        if not word in stopword and len(word) > 1:
            filter_word.append(word)
    return filter_word


# Filter POS and stop words over the whole document collection
def Filter_words(data_path=r'C:/Users/Administrator/Desktop/data//corpus.txt'):
    """Return a list of filtered token lists, one per line of the corpus file."""
    document = []
    # Load stop words once, not once per corpus line (the original re-read
    # the stop-word file for every line).
    stopword = Stop_words()
    for line in open(data_path, 'r', encoding='utf8'):
        segment = jieba.posseg.cut(line.strip())
        filter_words = []
        for word, flag in segment:
            if flag.startswith('n') is False:
                continue
            if not word in stopword and len(word) > 1:
                filter_words.append(word)
        document.append(filter_words)
    return document


def tf_idf():
    """Compute {word: tf-idf} for the global `text` against the corpus.

    TF is normalized by len(text); IDF uses log(N / (df + 1)) smoothing.
    """
    tf_dict = {}
    idf_dict = {}
    filter_word = Filter_word(text)
    # Term frequency over the current document.
    for word in filter_word:
        if word not in tf_dict:
            tf_dict[word] = 1
        else:
            tf_dict[word] += 1
    for word in tf_dict:
        tf_dict[word] = tf_dict[word] / len(text)
    # Document frequency over the corpus (set() counts each word once per doc).
    document = Filter_words()
    doc_total = len(document)
    for doc in document:
        for word in set(doc):
            if word not in idf_dict:
                idf_dict[word] = 1
            else:
                idf_dict[word] += 1
    for word in idf_dict:
        idf_dict[word] = math.log(doc_total / (idf_dict[word] + 1))
    # Combine; words absent from the corpus get idf 0.
    tf_idf_dict = {}
    for word in filter_word:
        if word not in idf_dict:
            idf_dict[word] = 0
        tf_idf_dict[word] = tf_dict[word] * idf_dict[word]
    return tf_idf_dict


tf_idf_dict = tf_idf()
keyword = 6  # number of top keywords to print
print('TF-IDF模型结果:')
for key, value in sorted(tf_idf_dict.items(), key=operator.itemgetter(1), reverse=True)[:keyword]:
    print(key, end=' ')
print('\n')
这段代码实现了一个 TF-IDF 模型,用于计算文本中关键词的权重。其中,Stop_words 函数用于读取停用词表,Filter_word 函数用于对单个文档进行过滤,Filter_words 函数用于对整个文档集进行过滤。tf_idf 函数用于计算 TF-IDF 值,最后输出了权重最高的前 keyword 个关键词。
with open('test_result0.txt', 'r',encoding='utf-8') as f, openpyxl.Workbook() as wb: AttributeError: __enter__
这个错误是因为 `openpyxl.Workbook` 对象没有实现上下文管理器协议(缺少 `__enter__` 方法),因此不能直接放在 `with` 语句中使用——这正是报错 `AttributeError: __enter__` 的含义。解决方法是将 `openpyxl.Workbook()` 的返回值赋给一个普通变量,在处理完成后显式调用 `save()` 保存。修改后的代码如下:
```python
import openpyxl

# Convert the tf-idf text report into an Excel workbook.
# Note: openpyxl.Workbook is NOT a context manager, so it is created
# normally and saved explicitly instead of being used in `with`.
with open('test_result0.txt', 'r', encoding='utf-8') as f:
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = 'Result'
    # Read the txt file line by line
    for line in f:
        # Turn "的tf-idf值为" into a tab separator
        line = line.replace('的tf-idf值为', '\t')
        # Split on the tab and write (word, score) to the sheet
        parts = line.split('\t')
        if len(parts) < 2:
            continue  # skip blank/malformed lines instead of raising IndexError
        sheet.append([parts[0], float(parts[1])])
# Save the Excel file
wb.save('result.xlsx')
```
这里将 `openpyxl.Workbook()` 的返回值赋值给 `wb` 变量,然后将这个变量用于 `with` 语句。这样就可以避免上述错误。