用 Python 实现情感词典生成(提取情感词),然后再用朴素贝叶斯进行分类的代码
时间: 2024-02-22 16:59:21 浏览: 59
基于python朴素贝叶斯实现的豆瓣影评情感分析
好的,下面是一个简单的情感词典生成和朴素贝叶斯分类的Python代码示例:
```python
import os
import re
from collections import Counter
import math
# Path of the sentiment dictionary file (written by generate_sentiment_dict
# and read back by the script section of this file)
SENTIMENT_DICT_FILE = 'sentiment_dict.txt'
# Path of the training set file (one "<text>TAB<label>" sample per line)
TRAIN_DATA_FILE = 'train_data.txt'
# Path of the test set file (same line format as the training set)
TEST_DATA_FILE = 'test_data.txt'
# Sentiment dictionary generator
def generate_sentiment_dict(train_data_file, output_file):
    """Write word frequencies of the label-'1' training samples to a file.

    Every non-blank line of ``train_data_file`` must look like
    ``<text><TAB><label>``, where the text is already tokenized with
    whitespace.  Only lines whose label is exactly ``'1'`` contribute to the
    counts.  The result is written to ``output_file`` as one
    ``<word><TAB><count>`` row per distinct word.

    Args:
        train_data_file: Path of the UTF-8 training file to read.
        output_file: Path of the UTF-8 dictionary file to (over)write.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    sentiment_words = Counter()
    with open(train_data_file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            # Split on the LAST tab so that a tab inside the text itself does
            # not break the two-field unpacking.
            words, sep, label = line.rpartition('\t')
            if not sep:
                raise ValueError(
                    'line {} of {!r} has no tab-separated label'.format(
                        line_no, train_data_file))
            if label == '1':
                # split() without an argument collapses runs of whitespace, so
                # no empty-string "word" ends up in the dictionary file.
                sentiment_words.update(words.split())
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(
            '{}\t{}\n'.format(word, freq)
            for word, freq in sentiment_words.items()
        )
# Naive Bayes classifier
class NaiveBayesClassifier:
    """Multinomial naive Bayes text classifier with add-one smoothing.

    Call :meth:`train` on a labelled file, then :meth:`predict` on raw text.
    Repeated ``train`` calls accumulate into the same counts.
    """

    def __init__(self):
        self.total_docs = 0   # number of training documents seen
        self.class_docs = {}  # label -> number of documents with that label
        self.word_freq = {}   # label -> Counter of word occurrences
        self.classes = set()  # all labels seen during training
        self.vocab = set()    # all distinct words seen during training

    # Training
    def train(self, train_data_file):
        """Accumulate document and word counts from a labelled file.

        Every non-blank line must look like ``<text><TAB><label>``, where the
        text is already tokenized with whitespace.

        Args:
            train_data_file: Path of the UTF-8 training file to read.

        Raises:
            ValueError: If a non-blank line contains no tab separator.
        """
        with open(train_data_file, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                # Split on the LAST tab so a tab inside the text is tolerated.
                words, sep, label = line.rpartition('\t')
                if not sep:
                    raise ValueError(
                        'line {} of {!r} has no tab-separated label'.format(
                            line_no, train_data_file))
                self.total_docs += 1
                self.class_docs[label] = self.class_docs.get(label, 0) + 1
                self.classes.add(label)
                # split() collapses repeated whitespace, keeping the empty
                # string out of the vocabulary.
                tokens = words.split()
                # setdefault guarantees a Counter exists for every label seen,
                # without allocating a throwaway Counter for every word.
                self.word_freq.setdefault(label, Counter()).update(tokens)
                self.vocab.update(tokens)

    # Prediction
    def predict(self, text):
        """Return the most probable label for ``text``.

        The text is tokenized with the regex ``\\w+`` (note: training splits
        on whitespace instead, so tokens containing punctuation may not line
        up).  Words that never occurred in training are ignored.  Ties are
        broken in favour of the label that sorts first.

        Args:
            text: Raw text to classify.

        Returns:
            The label with the highest log-posterior score.

        Raises:
            ValueError: If the classifier has not been trained yet.
        """
        if not self.classes:
            raise ValueError('classifier has not been trained')
        # Sorting makes tie-breaking deterministic; set iteration order of
        # strings changes between interpreter runs.
        labels = sorted(self.classes)
        vocab_size = len(self.vocab)
        # The smoothing denominators do not depend on the word, so compute
        # them once per call instead of once per (word, class) pair.
        denominators = {
            c: sum(self.word_freq[c].values()) + vocab_size for c in labels
        }
        scores = {
            c: math.log(self.class_docs[c] / self.total_docs) for c in labels
        }
        for word in re.findall(r'\w+', text):
            if word not in self.vocab:
                continue
            for c in labels:
                freq = self.word_freq[c].get(word, 0)
                scores[c] += math.log((freq + 1) / denominators[c])
        return max(scores, key=scores.get)
# Build the sentiment dictionary file from the training data
generate_sentiment_dict(TRAIN_DATA_FILE, SENTIMENT_DICT_FILE)
# Load the sentiment dictionary
# NOTE: nothing below reads sentiment_dict; the classifier is trained on the
# full training vocabulary.  It is kept as a module-level name for inspection.
sentiment_dict = set()
with open(SENTIMENT_DICT_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        # Strip only the newline: strip() would also remove the leading tab of
        # a row whose word is empty and make the two-field split fail.
        word, sep, freq = line.rstrip('\n').rpartition('\t')
        if not sep or not word:
            continue  # blank or malformed row
        if int(freq) > 10:  # drop words that occur too rarely
            sentiment_dict.add(word)
# Train the naive Bayes classifier
classifier = NaiveBayesClassifier()
classifier.train(TRAIN_DATA_FILE)
# Evaluate the naive Bayes classifier on the test set
num_correct = 0
num_total = 0
with open(TEST_DATA_FILE, 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f, 1):
        line = line.strip()
        if not line:
            continue
        # Split on the LAST tab so a tab inside the text is tolerated.
        text, sep, label = line.rpartition('\t')
        if not sep:
            raise ValueError(
                'line {} of {!r} has no tab-separated label'.format(
                    line_no, TEST_DATA_FILE))
        pred_label = classifier.predict(text)
        num_correct += int(pred_label == label)
        num_total += 1
# An empty test file would otherwise raise ZeroDivisionError.
accuracy = num_correct / num_total if num_total else 0.0
print('Accuracy: {:.2%}'.format(accuracy))
```
上面的代码实现了以下功能:
1. 从训练集文件中统计标签为 1 的样本的词频,生成情感词典文件。
2. 加载情感词典(仅保留出现次数大于 10 的词;注意本示例中分类器并未实际使用该词典),并用训练集文件训练朴素贝叶斯分类器。
3. 加载测试集文件,测试朴素贝叶斯分类器的准确率。
请注意,这只是一个简单的示例代码,实际上情感分析需要更多的预处理和特征提取步骤。
阅读全文