Write a naive Bayes Chinese text classifier in Python, with a 50/50 train/test split, and output precision, recall, and F1 score
Sure. Below is a simple naive Bayes Chinese text classifier that also computes precision, recall, and F1 score:
```python
import os
import math
import jieba

# Path to the stop-word file
stop_words_file = 'stop_words.txt'

# Compute prior probabilities and per-class word counts
def calc_prior_prob(train_path):
    num_docs = 0
    class_word_count = {}
    class_doc_count = {}
    # Walk the training set: one sub-directory per class
    for root, dirs, files in os.walk(train_path):
        for file in files:
            num_docs += 1
            class_name = os.path.basename(root)
            if class_name not in class_word_count:
                class_word_count[class_name] = {}
                class_doc_count[class_name] = 0
            # Count word frequencies for this document
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                for line in f:
                    for word in jieba.cut(line.strip()):
                        # Skip stop words
                        if word in stop_words:
                            continue
                        class_word_count[class_name][word] = class_word_count[class_name].get(word, 0) + 1
            class_doc_count[class_name] += 1
    # Prior probability of each class = its share of the training documents
    prior_prob = {}
    for class_name in class_doc_count:
        prior_prob[class_name] = class_doc_count[class_name] / num_docs
    return class_word_count, prior_prob

# Compute conditional probabilities with Laplace (add-one) smoothing:
# P(w|c) = (count(w, c) + 1) / (total words in c + |vocab|)
def calc_conditional_prob(class_word_count):
    conditional_prob = {}
    unseen_prob = {}
    for class_name in class_word_count:
        conditional_prob[class_name] = {}
        total_words = sum(class_word_count[class_name].values())
        for word in class_word_count[class_name]:
            conditional_prob[class_name][word] = (class_word_count[class_name][word] + 1) / (total_words + len(vocab))
        # Smoothed probability for words never seen in this class
        unseen_prob[class_name] = 1 / (total_words + len(vocab))
    return conditional_prob, unseen_prob

# Classify the test set and report macro-averaged precision, recall, and F1
def classify(test_path, class_word_count, prior_prob, conditional_prob, unseen_prob):
    correct = 0
    total = 0
    # Per-class counts so the metrics are meaningful for multiple classes
    TP = {c: 0 for c in class_word_count}
    FP = {c: 0 for c in class_word_count}
    FN = {c: 0 for c in class_word_count}
    # Walk the test set
    for root, dirs, files in os.walk(test_path):
        for file in files:
            total += 1
            class_name = os.path.basename(root)
            # Start each class's score at its log prior
            scores = {c: math.log(prior_prob[c]) for c in class_word_count}
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                for line in f:
                    for word in jieba.cut(line.strip()):
                        # Skip stop words
                        if word in stop_words:
                            continue
                        # Add each token's log conditional probability,
                        # falling back to the smoothed unseen-word probability
                        for c in class_word_count:
                            scores[c] += math.log(conditional_prob[c].get(word, unseen_prob[c]))
            # Predict the class with the highest score
            predict_class = max(scores, key=scores.get)
            if predict_class == class_name:
                correct += 1
                TP[class_name] += 1
            else:
                FP[predict_class] += 1
                FN[class_name] += 1
    # Macro-average precision, recall, and F1 over all classes
    precisions = [TP[c] / (TP[c] + FP[c]) if TP[c] + FP[c] else 0.0 for c in class_word_count]
    recalls = [TP[c] / (TP[c] + FN[c]) if TP[c] + FN[c] else 0.0 for c in class_word_count]
    precision = sum(precisions) / len(precisions)
    recall = sum(recalls) / len(recalls)
    f1_score = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    print('Accuracy:', correct / total)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 score:', f1_score)

# Load the stop words
stop_words = set()
with open(stop_words_file, 'r', encoding='utf-8') as f:
    for line in f:
        stop_words.add(line.strip())

# Training-set and test-set paths
train_path = 'train'
test_path = 'test'

# Build the vocabulary from the training set
vocab = set()
for root, dirs, files in os.walk(train_path):
    for file in files:
        with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
            for line in f:
                for word in jieba.cut(line.strip()):
                    # Skip stop words
                    if word in stop_words:
                        continue
                    vocab.add(word)

# Compute the priors
class_word_count, prior_prob = calc_prior_prob(train_path)
# Compute the conditional probabilities
conditional_prob, unseen_prob = calc_conditional_prob(class_word_count)
# Classify the test set
classify(test_path, class_word_count, prior_prob, conditional_prob, unseen_prob)
```
Here the training set and test set are assumed to live under `train` and `test` respectively, with one sub-directory per class (the class label is taken from the directory name, as in `train/<class_name>/doc.txt`). `calc_prior_prob` computes the class priors, `calc_conditional_prob` computes the Laplace-smoothed conditional probabilities, and `classify` classifies the test documents and reports macro-averaged precision, recall, and F1. The question asks for a 50/50 split; if the corpus is not pre-split, a helper like the sketch below can produce it.
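A minimal splitter sketch, assuming a hypothetical `data/<class_name>/` layout for the raw corpus (the `data` directory name and the fixed seed are illustrative assumptions):

```python
import os
import random
import shutil

def split_corpus(data_path, train_path, test_path, seed=42):
    """Copy half of each class's documents into train/ and half into test/."""
    random.seed(seed)  # fixed seed so the split is reproducible
    for class_name in os.listdir(data_path):
        class_dir = os.path.join(data_path, class_name)
        if not os.path.isdir(class_dir):
            continue
        files = sorted(os.listdir(class_dir))
        random.shuffle(files)
        half = len(files) // 2
        for dest, subset in ((train_path, files[:half]), (test_path, files[half:])):
            dest_dir = os.path.join(dest, class_name)
            os.makedirs(dest_dir, exist_ok=True)
            for name in subset:
                shutil.copy(os.path.join(class_dir, name), os.path.join(dest_dir, name))

split_corpus('data', 'train', 'test')
```

Splitting per class, rather than over the pooled file list, keeps the class proportions roughly equal in both halves.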
Note that because Chinese text has no whitespace word boundaries, the `jieba` segmenter is used for tokenization. To improve the classifier, a stop-word list is also applied to filter out uninformative tokens; a quick standalone check of this step is shown below.
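A small sanity-check snippet for segmentation plus stop-word filtering (the sample sentence and the three-word stop list are made up for the demo; the exact segmentation depends on the installed jieba dictionary):

```python
import jieba

stop_words = {'的', '是', '一种'}  # tiny inline stop list, demo only
text = '朴素贝叶斯是一种简单高效的分类算法'
tokens = [w for w in jieba.cut(text) if w not in stop_words]
print(tokens)  # e.g. ['朴素', '贝叶斯', '简单', '高效', '分类', '算法']
```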
The code above may still need some fine-tuning for a real corpus, but the basic approach should serve as a reference.
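For a quick cross-check of the hand-rolled implementation, the same pipeline can be built with scikit-learn's `CountVectorizer` and `MultinomialNB`, and `classification_report` then prints per-class precision, recall, and F1 directly. A sketch, assuming scikit-learn is installed and the same hypothetical `data/<class_name>/` layout as above:

```python
import os
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

def load_corpus(path):
    """Read every file under path/<class_name>/ and pre-segment it with jieba."""
    docs, labels = [], []
    for class_name in os.listdir(path):
        class_dir = os.path.join(path, class_name)
        if not os.path.isdir(class_dir):
            continue
        for name in os.listdir(class_dir):
            with open(os.path.join(class_dir, name), 'r', encoding='utf-8') as f:
                docs.append(' '.join(jieba.cut(f.read())))  # space-join for CountVectorizer
                labels.append(class_name)
    return docs, labels

docs, labels = load_corpus('data')
# 50/50 split, stratified so each class is halved
X_train, X_test, y_train, y_test = train_test_split(
    docs, labels, test_size=0.5, random_state=42, stratify=labels)

# token_pattern relaxed so single-character Chinese words are kept
vec = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
clf = MultinomialNB().fit(vec.fit_transform(X_train), y_train)
y_pred = clf.predict(vec.transform(X_test))
print(classification_report(y_test, y_pred, digits=4))
```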