Chinese Email Classification with the Naive Bayes Algorithm
A naive Bayes classifier for Chinese email can be implemented in the following steps:
1. Prepare the dataset: split the emails into a training set and a test set. The training set is used to fit the model; the test set measures its accuracy.
2. Word segmentation: segment the body of each email into words (here with jieba), turning an email into a list of tokens.
3. Feature extraction: turn each email into a feature representation; the code below simply uses the token list itself as a bag of words.
4. Train the model: estimate each class's prior probability and the per-word conditional probabilities from the training set (see the formula sketch after this list).
5. Test the model: measure accuracy on the test set.
6. Apply the model: classify new emails with the trained model (a usage sketch follows the full listing below).
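For step 4, `train_NB` below uses the standard multinomial naive Bayes estimates with Laplace (add-one) smoothing, and `predict_NB` picks the class with the largest log posterior. With N emails in total, N_c of them in class c, count(w, c) the number of occurrences of word w across the class-c training emails, V the training vocabulary, and d a document's token list:
```latex
P(c) = \frac{N_c}{N}, \qquad
P(w \mid c) = \frac{\operatorname{count}(w, c) + 1}{\sum_{w' \in V} \operatorname{count}(w', c) + |V|}

\hat{c} = \arg\max_{c} \Big( \log P(c) + \sum_{w \in d} \log P(w \mid c) \Big)
```
The sums are taken in log space, as in the code, to avoid floating-point underflow when multiplying many small probabilities.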
The code implementation is as follows:
```python
import jieba
import os
import random
import numpy as np
# Segmentation: read a file and split its text into a token list with jieba
def cut_words(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    words = jieba.cut(text)
    return list(words)
# Load the stop-word list (one word per line in stopwords.txt)
def get_stopwords():
    with open('stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = f.readlines()
    stopwords = [word.strip() for word in stopwords]
    return stopwords
# Collect the paths of all files under root_path (recursively)
def get_file_path(root_path):
    file_path_list = []
    for root, dirs, files in os.walk(root_path):
        for file in files:
            file_path_list.append(os.path.join(root, file))
    return file_path_list
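# Assumed (hypothetical) layout of the 'data' directory used in __main__ below,
# implied by the code taking each email's label from its parent folder name:
#   data/
#       spam/   *.txt   <- spam emails
#       ham/    *.txt   <- legitimate emails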
# Estimate the class priors and per-word conditional probabilities
def train_NB(train_data, train_label):
    # Prior probability: P(c) = samples in class c / total samples
    prior_prob = {}
    total_num = len(train_label)
    label_set = set(train_label)
    for label in label_set:
        prior_prob[label] = train_label.count(label) / total_num
    # Conditional probability P(w|c) with Laplace (add-one) smoothing
    word_prob = {}
    words_set = set([word for text in train_data for word in text])
    for label in label_set:
        word_prob[label] = {}
        label_index = [i for i in range(len(train_label)) if train_label[i] == label]
        label_text = [train_data[i] for i in label_index]
        total_words_num = sum([len(text) for text in label_text])
        for word in words_set:
            word_prob[label][word] = (sum([text.count(word) for text in label_text]) + 1) / (total_words_num + len(words_set))
    return prior_prob, word_prob
# Predict a label for each test document by maximizing the log posterior
def predict_NB(test_data, prior_prob, word_prob):
    predict_label = []
    label_set = list(prior_prob.keys())
    for text in test_data:
        max_prob = -np.inf
        predict = None
        for label in label_set:
            # log P(c) + sum of log P(w|c); words unseen in training are skipped
            prob = np.log(prior_prob[label])
            for word in text:
                if word in word_prob[label]:
                    prob += np.log(word_prob[label][word])
            if prob > max_prob:
                max_prob = prob
                predict = label
        predict_label.append(predict)
    return predict_label
# k-fold cross-validation: returns the mean accuracy over k folds
def cross_validation(data, label, k):
    data_num = len(label)
    index = list(range(data_num))
    random.shuffle(index)
    fold_size = data_num // k
    accuracy_list = []
    for i in range(k):
        start = i * fold_size
        end = min((i + 1) * fold_size, data_num)
        test_index = index[start:end]
        train_index = list(set(index) - set(test_index))
        train_data = [data[j] for j in train_index]
        train_label = [label[j] for j in train_index]
        test_data = [data[j] for j in test_index]
        test_label = [label[j] for j in test_index]
        prior_prob, word_prob = train_NB(train_data, train_label)
        predict_label = predict_NB(test_data, prior_prob, word_prob)
        accuracy = sum([1 for j in range(len(test_label)) if test_label[j] == predict_label[j]]) / len(test_label)
        accuracy_list.append(accuracy)
    return sum(accuracy_list) / k
if __name__ == '__main__':
    # Load the stop-word list (a set gives O(1) membership tests)
    stopwords = set(get_stopwords())
    # Collect all file paths
    file_path_list = get_file_path('data')
    # Read every email and its label (label = name of the parent directory)
    text_list = []
    label_list = []
    for file_path in file_path_list:
        label = os.path.basename(os.path.dirname(file_path))  # portable, unlike splitting on '\\'
        label_list.append(label)
        words = cut_words(file_path)
        words = [word for word in words if word not in stopwords]
        text_list.append(words)
    # Run 5-fold cross-validation
    accuracy = cross_validation(text_list, label_list, 5)
    print('Model accuracy: %.2f%%' % (accuracy * 100))
```
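Step 6, applying the trained model to a new email, is not exercised in the listing above. A minimal sketch under the same assumptions, where `prior_prob`/`word_prob` come from `train_NB` and `new_mail.txt` is a hypothetical path standing in for a real incoming message:
```python
# Sketch of step 6: classify one new email with a trained model.
# 'new_mail.txt' is a hypothetical file path, not part of the dataset above.
def classify_new_mail(file_path, stopwords, prior_prob, word_prob):
    # Segment and filter the new email exactly as the training data was
    words = [w for w in cut_words(file_path) if w not in stopwords]
    # predict_NB expects a list of documents, so wrap the single email
    return predict_NB([words], prior_prob, word_prob)[0]

# Example usage (hypothetical): train on the full corpus, then classify
# prior_prob, word_prob = train_NB(text_list, label_list)
# print(classify_new_mail('new_mail.txt', stopwords, prior_prob, word_prob))
```
Note that words absent from the training vocabulary are simply ignored at prediction time, consistent with the `if word in word_prob[label]` check in `predict_NB`.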