Please modify the following code:

```
import math
from collections import defaultdict

corpus = ["二价 二价 二价 四价 预约", "四价 四价 四价 九价 预约", "九价 九价 九价 九价 预约"]
words = []
for sentence in corpus:
    words.append(sentence.strip().split())

# Count word frequencies per document
def Counter(words):
    word_count = []
    for sentence in words:
        word_dict = defaultdict(int)
        for word in sentence:
            word_dict[word] += 1
        word_count.append(word_dict)
    return word_count

word_count = Counter(words)

# Compute TF (word is the word being scored; word_dict is the frequency dict of its sentence)
def tf(word, word_dict):
    return word_dict[word] / sum(word_dict.values())

# Count the sentences that contain the word
def count_sentence(word, word_count):
    return sum([1 for i in word_count if i.get(word)])  # i[word] >= 1

# Compute IDF
def idf(word, word_count):
    return math.log((len(word_count) / (count_sentence(word, word_count) + 1)), 10)

# Compute TF-IDF
def tfidf(word, word_dict, word_count):
    return tf(word, word_dict) * idf(word, word_count)

p = 1
for word_dict in word_count:
    print("part:{}".format(p))
    p += 1
    for word, cnt in word_dict.items():
        print("word: {} ---- TF-IDF:{}".format(word, tfidf(word, word_dict, word_count)))
        print("word: {} ---- TF:{}".format(word, tf(word, word_dict)))
        print("word: {} ---- IDF:{}".format(word, idf(word, word_count)))
        print("word: {} ---- count_sentence:{}".format(word, count_sentence(word, word_count)))
```

I want to improve the IDF. A custom heat-weight file weight.txt should store a heat weight for each document, and the improved IDF value is the logarithm of the total of all document heat weights divided by the sum of the heat weights of the documents that contain the word. Please write the improved Python code.
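Before the full code, here is a minimal worked example of the improved IDF with hypothetical heat weights (the +1 smoothing from the original idf is kept); the values below are placeholders, not read from weight.txt:

```
import math

# Hypothetical heat weights for the three documents (assumed values)
weights = [1.0, 2.0, 3.0]          # total heat weight = 6.0
docs_containing_word = [1.0, 2.0]  # e.g. a word that appears in the first two documents

improved_idf = math.log(sum(weights) / (sum(docs_containing_word) + 1), 10)
print(improved_idf)  # log10(6.0 / 4.0) ≈ 0.176
```

The code below implements this same formula, reading the per-document weights from weight.txt.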
Here is the improved code:
```
import math
from collections import defaultdict

corpus = ["二价 二价 二价 四价 预约", "四价 四价 四价 九价 预约", "九价 九价 九价 九价 预约"]
words = []
for sentence in corpus:
    words.append(sentence.strip().split())

# Count word frequencies per document
def Counter(words):
    word_count = []
    for sentence in words:
        word_dict = defaultdict(int)
        for word in sentence:
            word_dict[word] += 1
        word_count.append(word_dict)
    return word_count

word_count = Counter(words)

# Read the heat-weight file: one weight per line, in the same order as corpus
with open('weight.txt', 'r') as f:
    weight = [float(line.strip()) for line in f if line.strip()]

# Count how many documents contain the word
def count_sentence(word, word_count):
    return sum([1 for i in word_count if i.get(word)])

# Sum the heat weights of the documents that contain the word
# (documents are paired with weights by position via zip, instead of list.index,
# which would pick the wrong weight if two documents had identical word counts)
def weighted_count_sentence(word, word_count, weight):
    return sum(w for word_dict, w in zip(word_count, weight) if word_dict.get(word))

# Improved IDF: log of (total heat weight / heat weight of documents containing the word),
# keeping the +1 smoothing from the original code
def idf(word, word_count, weight):
    if count_sentence(word, word_count) == 0:
        return 0
    return math.log(sum(weight) / (weighted_count_sentence(word, word_count, weight) + 1), 10)

# Compute TF
def tf(word, word_dict):
    return word_dict[word] / sum(word_dict.values())

# Compute TF-IDF
def tfidf(word, word_dict, word_count, weight):
    return tf(word, word_dict) * idf(word, word_count, weight)

# Print the results
p = 1
for word_dict in word_count:
    print("part:{}".format(p))
    p += 1
    for word, cnt in word_dict.items():
        print("word: {} ---- TF-IDF:{}".format(word, tfidf(word, word_dict, word_count, weight)))
        print("word: {} ---- TF:{}".format(word, tf(word, word_dict)))
        print("word: {} ---- IDF:{}".format(word, idf(word, word_count, weight)))
        print("word: {} ---- count_sentence:{}".format(word, count_sentence(word, word_count)))
```
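A sketch of what weight.txt could look like for this three-document corpus, assuming one heat weight per line in the same order as corpus (the values are placeholders, not prescribed by the question):

```
1.0
2.0
3.0
```

The weights are matched to documents purely by position, so weight.txt must contain exactly one line per document in corpus; a missing or extra line would silently pair weights with the wrong documents.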