Provide code that uses the Rocchio algorithm to classify test documents. It should implement the numeric tf-idf computation, storage of term weights, weight normalization, and the class centroid vector. The sklearn library must not be used. The table is as follows:

| | Doc ID | Words in document | In class c = China? |
|---|---|---|---|
| Training set | 1 | Chinese Beijing Chinese | Yes |
| | 2 | Chinese Shanghai | Yes |
| | 3 | Macao Shandong | Yes |
The following Python code classifies the test documents with the Rocchio algorithm without using sklearn; the training and test data are defined directly in the script, so no external files or paths are needed.
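As a quick outline of the approach: each class c is represented by the centroid of its training document vectors, μ(c) = (1/|Dc|) · Σ_{d ∈ Dc} v(d), where v(d) is the normalized tf-idf vector of document d, and a test document is assigned to the class whose centroid has the highest similarity to it (here measured with a simple dot product). The tf-idf weights are built in four steps: raw term counts, tf = count / document length, smoothed idf = log(N / (df + 1)), and min-max normalization of the resulting weights.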
```python
import math

# Count term frequencies within one document
def count_word_freq(words):
    freq_dict = {}
    for word in words:
        freq_dict[word] = freq_dict.get(word, 0) + 1
    return freq_dict

# Compute tf values (term frequency divided by document length)
def calc_tf(freq_dict):
    tf_dict = {}
    word_count = sum(freq_dict.values())
    for word, freq in freq_dict.items():
        tf_dict[word] = freq / word_count
    return tf_dict

# Compute smoothed idf values over the training documents: log(N / (df + 1))
def calc_idf(doc_list):
    idf_dict = {}
    n = len(doc_list)
    for doc in doc_list:
        for word in set(doc['words']):
            idf_dict[word] = idf_dict.get(word, 0) + 1
    for word, df in idf_dict.items():
        idf_dict[word] = math.log(n / (df + 1))
    return idf_dict

# Compute tf-idf values; terms never seen in training get idf 0
def calc_tf_idf(tf_dict, idf_dict):
    tf_idf_dict = {}
    for word, tf in tf_dict.items():
        tf_idf_dict[word] = tf * idf_dict.get(word, 0.0)
    return tf_idf_dict

# Min-max normalize the weights; if all weights are equal, return them unchanged
def normalize_weight(tf_idf_dict):
    weight_list = list(tf_idf_dict.values())
    max_weight = max(weight_list)
    min_weight = min(weight_list)
    if max_weight == min_weight:
        return dict(tf_idf_dict)
    norm_tf_idf_dict = {}
    for word, tf_idf in tf_idf_dict.items():
        norm_tf_idf_dict[word] = (tf_idf - min_weight) / (max_weight - min_weight)
    return norm_tf_idf_dict

# Compute the centroid vector of one class (average of its document weight vectors)
def calc_centroid(train_docs, class_label):
    centroid = {}
    class_doc_list = [doc for doc in train_docs if doc['class'] == class_label]
    class_doc_count = len(class_doc_list)
    for doc in class_doc_list:
        for word, weight in doc['norm_tf_idf'].items():
            centroid[word] = centroid.get(word, 0) + weight
    for word, weight in centroid.items():
        centroid[word] = weight / class_doc_count
    return centroid

# Rocchio classification: assign the class whose centroid is most similar (dot product)
def classify(test_doc, train_docs, class_labels):
    test_words = test_doc['words']
    tf_dict = calc_tf(count_word_freq(test_words))
    idf_dict = calc_idf(train_docs)
    tf_idf_dict = calc_tf_idf(tf_dict, idf_dict)
    norm_tf_idf_dict = normalize_weight(tf_idf_dict)
    test_doc['norm_tf_idf'] = norm_tf_idf_dict
    max_sim = float('-inf')
    predict_label = None
    for class_label in class_labels:
        centroid = calc_centroid(train_docs, class_label)
        sim = sum(weight * centroid.get(word, 0) for word, weight in norm_tf_idf_dict.items())
        if sim > max_sim:
            max_sim = sim
            predict_label = class_label
    return predict_label

# Test data
test_docs = [
    {'id': 1, 'words': ['Chinese', 'Tokyo', 'Japan']},
    {'id': 2, 'words': ['Beijing', 'Shanghai', 'China']}
]
# Training data
train_docs = [
    {'id': 1, 'words': ['Chinese', 'Beijing', 'Chinese'], 'class': 'China'},
    {'id': 2, 'words': ['Chinese', 'Shanghai'], 'class': 'China'},
    {'id': 3, 'words': ['Macao', 'Shandong'], 'class': 'China'}
]
class_labels = set(doc['class'] for doc in train_docs)

# Store the normalized tf-idf weight vector of every training document
# (required before the class centroids can be computed)
train_idf = calc_idf(train_docs)
for doc in train_docs:
    doc_tf = calc_tf(count_word_freq(doc['words']))
    doc['norm_tf_idf'] = normalize_weight(calc_tf_idf(doc_tf, train_idf))

# Classify the test documents
for test_doc in test_docs:
    predict_label = classify(test_doc, train_docs, class_labels)
    print('Test document %d belongs to class %s' % (test_doc['id'], predict_label))
```
Output:
```
Test document 1 belongs to class China
Test document 2 belongs to class China
```
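Two remarks on the result: since every training document belongs to the single class China, any test document is necessarily assigned to China. For reference, here is a minimal sanity check of the idf values over the three training documents, using the same smoothed formula log(N / (df + 1)) as in the code above ('Chinese' occurs in two documents, every other term in exactly one):

```python
import math

N = 3  # number of training documents
df = {'Chinese': 2, 'Beijing': 1, 'Shanghai': 1, 'Macao': 1, 'Shandong': 1}

# Smoothed idf as used above: log(N / (df + 1))
for word, count in df.items():
    print(word, round(math.log(N / (count + 1)), 4))
# Chinese -> 0.0; Beijing, Shanghai, Macao, Shandong -> 0.4055
```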