现有好评和差评两个中文文本文件,我需要将这两个文件作为训练集,使用sklearn实现贝叶斯分类,区分好评和差评,并给出准确率和结果可视化图,请给出示例代码
时间: 2024-02-03 18:13:39 浏览: 59
以下是实现贝叶斯分类的示例代码:
```python
import os
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# 读取数据
def load_data(pos_file="./pos.txt", neg_file="./neg.txt"):
    """Read the positive and negative review files, one review per line.

    Args:
        pos_file: Path to the UTF-8 text file holding positive reviews.
        neg_file: Path to the UTF-8 text file holding negative reviews.

    Returns:
        A ``(pos_data, neg_data)`` tuple of lists. Each list holds the
        whitespace-stripped, non-empty lines of the corresponding file.
    """

    def _read_reviews(path):
        # Iterate the file lazily and drop blank lines, so that empty
        # strings never end up as labelled training/test samples.
        with open(path, "r", encoding="utf-8") as f:
            return [text for text in (line.strip() for line in f) if text]

    return _read_reviews(pos_file), _read_reviews(neg_file)
# 分词
def seg_text(text):
    """Segment *text* with jieba and return the tokens joined by spaces."""
    return " ".join(jieba.cut(text))
# 特征提取
def extract_feature(train_data):
    """Fit a bag-of-words vectorizer on the training texts.

    Args:
        train_data: Iterable of texts whose words are already separated
            by spaces.

    Returns:
        A ``(vectorizer, feature)`` tuple: the fitted ``CountVectorizer``
        and the term-count matrix it produced for ``train_data``.
    """
    # CountVectorizer's default token_pattern only keeps tokens of two or
    # more characters, which would silently discard single-character
    # Chinese words (e.g. "好", "差", "不"). Accept tokens of any length.
    vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
    feature = vectorizer.fit_transform(train_data)
    return vectorizer, feature
# 训练模型
def train_model(pos_data, neg_data):
    """Train a multinomial naive Bayes classifier on the two review sets.

    Positive reviews are labelled 1 and negative reviews 0. Every review is
    segmented with ``seg_text`` before being vectorized.

    Returns:
        A ``(clf, vectorizer)`` tuple: the fitted classifier and the fitted
        vectorizer needed to transform new texts.
    """
    segmented_pos = list(map(seg_text, pos_data))
    segmented_neg = list(map(seg_text, neg_data))

    corpus = segmented_pos + segmented_neg
    targets = [1] * len(segmented_pos) + [0] * len(segmented_neg)

    vectorizer, counts = extract_feature(corpus)
    clf = MultinomialNB()
    clf.fit(counts, targets)
    return clf, vectorizer
# 预测
def predict(clf, vectorizer, test_data):
    """Segment and vectorize *test_data*, then return ``clf``'s predictions.

    ``vectorizer`` is only used to transform the texts; it is not refitted.
    """
    segmented = list(map(seg_text, test_data))
    return clf.predict(vectorizer.transform(segmented))
# 计算准确率
def calc_accuracy(label, pred):
    """Return the accuracy score of predictions *pred* against *label*."""
    return accuracy_score(label, pred)
# 可视化结果
def plot_result(pos_acc, neg_acc):
    """Show a bar chart comparing the positive and negative accuracies.

    The y-axis is fixed to the range 0..1 and labelled "Accuracy".
    """
    class_names = ["Positive", "Negative"]
    scores = [pos_acc, neg_acc]

    plt.bar(range(len(scores)), scores, tick_label=class_names)
    plt.ylim(0, 1)
    plt.ylabel("Accuracy")
    plt.show()
if __name__ == "__main__":
    # Number of reviews per class used for training; everything after that
    # index in each class is held out for testing.
    TRAIN_SIZE = 800

    pos_data, neg_data = load_data()

    # Fail early with a clear message: if a class has no held-out reviews,
    # prediction and accuracy computation would run on an empty test set.
    if min(len(pos_data), len(neg_data)) <= TRAIN_SIZE:
        raise SystemExit(
            "Need more than {} reviews per class, got {} positive and {} "
            "negative.".format(TRAIN_SIZE, len(pos_data), len(neg_data))
        )

    train_pos_data = pos_data[:TRAIN_SIZE]
    train_neg_data = neg_data[:TRAIN_SIZE]
    test_pos_data = pos_data[TRAIN_SIZE:]
    test_neg_data = neg_data[TRAIN_SIZE:]

    clf, vectorizer = train_model(train_pos_data, train_neg_data)

    # Each class is evaluated separately against its known label.
    pos_pred = predict(clf, vectorizer, test_pos_data)
    neg_pred = predict(clf, vectorizer, test_neg_data)
    pos_acc = calc_accuracy([1] * len(pos_pred), pos_pred)
    neg_acc = calc_accuracy([0] * len(neg_pred), neg_pred)

    print("Positive Accuracy: {:.2f}%".format(pos_acc * 100))
    print("Negative Accuracy: {:.2f}%".format(neg_acc * 100))
    plot_result(pos_acc, neg_acc)
```
在运行上述代码前,请确保已安装好相关的依赖包(jieba、scikit-learn、matplotlib),并准备好 UTF-8 编码的 `pos.txt` 和 `neg.txt` 两个文本文件,文件中每行为一条评价。代码中将每个文件的前800条数据作为训练集,其余数据(每个文件各有1000条时即后200条)作为测试集,可以根据实际情况进行调整。
运行结果会输出两个分类的准确率,并给出一个柱状图可视化结果。
阅读全文