从文档集中提取TF-IDF值按降序排列的前200个词汇
时间: 2023-06-10 07:06:49 浏览: 94
基于特定语料库的TF-IDF的中文关键词提取
好的，以下是从文档集中提取TF-IDF值按降序排列的前200个词汇的Python代码：
```python
import os
import math
from collections import Counter
# Compute term frequency (TF)
def compute_tf(word_freq, doc_words):
    """Return the term frequency of every word in one document.

    Args:
        word_freq: Mapping of word -> number of occurrences in the document.
        doc_words: Sequence of all tokens of that document; its length is
            used as the normalising total.

    Returns:
        A dict mapping each word in ``word_freq`` to
        ``freq / len(doc_words)``. When ``doc_words`` is empty every word
        maps to ``0.0``.
    """
    doc_word_count = len(doc_words)
    if doc_word_count == 0:
        # An empty token list would otherwise raise ZeroDivisionError.
        return {word: 0.0 for word in word_freq}
    return {
        word: freq / float(doc_word_count)
        for word, freq in word_freq.items()
    }
# Compute inverse document frequency (IDF)
def compute_idf(doc_list):
    """Return the inverse document frequency of every word in the corpus.

    Args:
        doc_list: List of per-document mappings of word -> occurrence count.

    Returns:
        A dict mapping each word that appears as a key in any document to
        ``log(n / df)``, where ``n`` is the number of documents and ``df``
        is the number of documents in which the word's count is positive.
        Words whose count is never positive map to ``0.0``. An empty
        ``doc_list`` yields an empty dict.
    """
    n = len(doc_list)

    # Count, over ALL documents, how many documents contain each word.
    # Seeding the vocabulary from the first document only would raise
    # KeyError for any word that first appears in a later document.
    doc_freq = Counter()
    for doc in doc_list:
        for word, count in doc.items():
            if count > 0:
                doc_freq[word] += 1
            else:
                # Keep the word in the vocabulary without counting it.
                doc_freq.setdefault(word, 0)

    # df == 0 would divide by zero; such a word carries no weight.
    return {
        word: math.log(n / float(df)) if df else 0.0
        for word, df in doc_freq.items()
    }
# Compute TF-IDF
def compute_tfidf(tf_dict, idf_dict):
    """Return the TF-IDF weight of every word in one document.

    Args:
        tf_dict: Mapping of word -> term frequency for the document.
        idf_dict: Mapping of word -> inverse document frequency.

    Returns:
        A dict mapping each word in ``tf_dict`` to ``tf * idf_dict[word]``.

    Raises:
        KeyError: If a word in ``tf_dict`` is missing from ``idf_dict``.
    """
    return {word: tf * idf_dict[word] for word, tf in tf_dict.items()}
# Read the document collection.
# NOTE: tokens are obtained by whitespace splitting, so Chinese text must
# already be word-segmented (tokens separated by spaces) on disk.
doc_list = []        # one word -> count mapping per document
doc_words_list = []  # one token list per document, parallel to doc_list
for filename in sorted(os.listdir('./docs')):  # sorted for reproducible output
    path = os.path.join('./docs', filename)
    if not os.path.isfile(path):
        # Sub-directories cannot be opened as text files; skip them.
        continue
    # Explicit encoding so results do not depend on the platform default;
    # adjust if the corpus is stored in another encoding.
    with open(path, 'r', encoding='utf-8') as f:
        doc_words = f.read().lower().split()
    doc_words_list.append(doc_words)
    doc_list.append(dict(Counter(doc_words)))

# Compute the TF-IDF weight of every word in every document.
# IDF depends only on the whole collection, so it is computed once rather
# than once per document.
idf_dict = compute_idf(doc_list) if doc_list else {}
tfidf_list = []
for doc, doc_words in zip(doc_list, doc_words_list):
    # Normalise by THIS document's own token list, not by whichever file
    # happened to be read last.
    tf_dict = compute_tf(doc, doc_words)
    tfidf_dict = compute_tfidf(tf_dict, idf_dict)
    tfidf_list.append(tfidf_dict)

# Sum each word's TF-IDF weight over the whole collection.
word_tfidf = {}
for tfidf_dict in tfidf_list:
    for word, tfidf in tfidf_dict.items():
        word_tfidf[word] = word_tfidf.get(word, 0) + tfidf

# Sort by TF-IDF in descending order and print the top 200 words.
sorted_word_tfidf = sorted(word_tfidf.items(), key=lambda x: x[1], reverse=True)
for word, tfidf in sorted_word_tfidf[:200]:
    print(word, tfidf)
```
需要注意的是，这段代码假设文档集都存储在`./docs`目录下，每个文档都是纯文本文件。你需要根据自己的情况修改相应的路径和文件格式。另外，代码使用`split()`按空白字符切分词语，因此中文文本需要先完成分词（词与词之间用空格分隔），否则整句会被当作一个词处理。
阅读全文