给下面这段代码中的预测结果实现可视化操作:

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import jieba
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

good_comments = []
bad_comments = []
with open('D:\PyCharmProjects\爬虫测试\好评.txt', 'r', encoding='gbk') as f:
    for line in f.readlines():
        good_comments.append(line.strip('\n'))
with open('D:\PyCharmProjects\爬虫测试\差评.txt', 'r', encoding='gbk') as f:
    for line in f.readlines():
        bad_comments.append(line.strip('\n'))
with open('StopWords.txt', 'r', encoding='utf-8') as f:
    stopwords = f.read().splitlines()
good_words = []
for line in good_comments:
    words = jieba.cut(line, cut_all=False)
    words = [w for w in words if w not in stopwords]
    good_words.append(' '.join(words))
bad_words = []
for line in bad_comments:
    words = jieba.cut(line, cut_all=False)
    words = [w for w in words if w not in stopwords]
    bad_words.append(' '.join(words))
# 将文本转换为向量
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(good_words + bad_words)
y = [1] * len(good_words) + [0] * len(bad_words)
# 将数据分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# 训练模型
clf = MultinomialNB()
clf.fit(X_train, y_train)
# 测试模型并计算准确率
pred = clf.predict(X_test)
accuracy = sum(pred == y_test) / len(y_test)
print('准确率:{:.2%}'.format(accuracy))
# 预测新数据的类别
with open('测试评论.txt', 'r', encoding='utf-8') as f:
    count = 0
    for line in f.readlines():
        count += 1
        test_text = line.strip('\n')
        test_words = ' '.join(jieba.cut(test_text, cut_all=False))
        test_vec = vectorizer.transform([test_words])
        pred = clf.predict(test_vec)
        if pred[0] == 1:
            print(count, '好评')
        else:
            print(count, '差评')
```
时间: 2024-01-11 19:02:34 浏览: 86
下面是代码实现可视化的过程:
首先,我们可以使用以下代码可视化训练集和测试集的数量分布:
```python
# Bar chart comparing how many comments landed in the training split versus
# the test split produced by train_test_split.
split_names = ['Train', 'Test']
split_sizes = [len(y_train), len(y_test)]
bar_colors = ['b', 'g']

plt.figure(figsize=(8, 6))
plt.bar(split_names, split_sizes, color=bar_colors)
plt.title('Distribution of Dataset')
plt.xlabel('Dataset')
plt.ylabel('Number of Comments')
plt.show()
```
接下来,我们可以使用以下代码绘制混淆矩阵,以便可视化分类器的性能:
```python
import itertools

from sklearn.metrics import confusion_matrix

# Recompute the test-set predictions here instead of reusing `pred`: the main
# script rebinds `pred` to a single-comment prediction inside its final loop,
# so by this point `pred` may no longer have one entry per element of y_test.
y_test_pred = clf.predict(X_test)

# Labels follow the training script: 0 = negative comment, 1 = positive comment.
# Passing labels explicitly keeps the matrix 2x2 (and aligned with the tick
# names) even if one class happens to be absent from the test split.
class_names = ['Negative', 'Positive']
conf_mat = confusion_matrix(y_test, y_test_pred, labels=[0, 1])

plt.figure(figsize=(6, 6))
plt.imshow(conf_mat, cmap=plt.cm.Blues, interpolation='nearest')
plt.colorbar()
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names, rotation=45)
plt.yticks(tick_marks, class_names)

# Write each count into its cell; switch to white text on the darker cells so
# the number stays readable against the Blues colormap.
threshold = conf_mat.max() / 2.
for i, j in itertools.product(range(conf_mat.shape[0]), range(conf_mat.shape[1])):
    plt.text(j, i, format(conf_mat[i, j], 'd'),
             horizontalalignment="center",
             color="white" if conf_mat[i, j] > threshold else "black")
plt.tight_layout()
plt.show()
```
最后,我们可以使用以下代码可视化分类器的精确率、召回率和 F1 分数:
```python
from sklearn.metrics import precision_score, recall_score, f1_score

# Recompute the test-set predictions here instead of reusing `pred`: the main
# script rebinds `pred` to a single-comment prediction inside its final loop,
# so by this point `pred` may no longer have one entry per element of y_test.
y_test_pred = clf.predict(X_test)

# With scikit-learn's default pos_label=1 these scores describe the positive
# class, which the training script encodes as label 1.
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

plt.figure(figsize=(8, 6))
plt.bar(['Precision', 'Recall', 'F1'], [precision, recall, f1], color=['b', 'g', 'r'])
plt.ylim(0, 1)  # all three metrics are bounded to [0, 1]
plt.xlabel('Metric')
plt.ylabel('Score')
plt.title('Performance Metrics')
plt.show()
```
阅读全文