可以帮我使用python编写基于朴素贝叶斯的垃圾邮件分类代码吗
时间: 2024-02-04 14:02:22 浏览: 65
当然可以,以下是一个简单的基于朴素贝叶斯的垃圾邮件分类代码:
```python
import os
import re
import numpy as np
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
# Read a file and preprocess it into word tokens.
def read_file(file_path):
    """Read a text file and return its lowercase alphanumeric tokens.

    Every non-alphanumeric character is replaced with a space before
    splitting, so the result is a flat list of lowercase word tokens.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as fh:
        raw = fh.read()
    # Strip punctuation/symbols, lowercase, then split on whitespace.
    cleaned = re.sub(r'[^a-zA-Z0-9]', ' ', raw).lower()
    return cleaned.split()
# Get the word list and label for every file in the dataset.
def load_dataset(spam_dir='spam', ham_dir='ham'):
    """Load tokenized emails and their labels from two directories.

    The directories were previously hard-coded; they are now parameters
    with the original values as defaults, so existing callers are
    unaffected. Filenames are sorted so the document order (and hence any
    later train/test split) is reproducible across platforms —
    ``os.listdir`` order is otherwise arbitrary.

    Args:
        spam_dir: directory of spam emails (labelled 1).
        ham_dir: directory of ham (non-spam) emails (labelled 0).

    Returns:
        (documents, labels): a list of token lists and a parallel list of
        int labels (1 = spam, 0 = ham).
    """
    documents = []
    labels = []
    for directory, label in ((spam_dir, 1), (ham_dir, 0)):
        for file_name in sorted(os.listdir(directory)):
            documents.append(read_file(os.path.join(directory, file_name)))
            labels.append(label)
    return documents, labels
# Build the vocabulary from every document.
def create_vocab_list(dataset):
    """Return a list of the unique words appearing across all documents."""
    vocabulary = set()
    for doc in dataset:
        # In-place update avoids rebuilding the set on every union.
        vocabulary.update(doc)
    return list(vocabulary)
# Convert an input document into a binary bag-of-words vector.
def document_to_vector(vocab_list, input_document):
    """Convert a token list into a binary (set-of-words) vector.

    The original looked each word up with ``word in vocab_list`` and
    ``vocab_list.index(word)`` — two O(V) scans per token, O(V*W) total.
    Building a word→index dict once makes it O(V + W). ``setdefault``
    keeps the first occurrence, matching ``list.index`` semantics even if
    the vocabulary ever contained duplicates.

    Args:
        vocab_list: ordered vocabulary; defines the vector layout.
        input_document: list of word tokens.

    Returns:
        List of 0/1 ints of length ``len(vocab_list)``; 1 marks presence.
    """
    word_index = {}
    for i, word in enumerate(vocab_list):
        word_index.setdefault(word, i)
    vector = [0] * len(vocab_list)
    for word in input_document:
        idx = word_index.get(word)
        if idx is not None:
            vector[idx] = 1
    return vector
# Train the naive Bayes classifier.
def train_NB(train_matrix, train_labels):
    """Estimate naive-Bayes parameters from binary document vectors.

    Word counts are Laplace-smoothed (counts start at 1, denominators at
    2) so unseen words never produce log(0).

    Args:
        train_matrix: list of 0/1 word-presence vectors.
        train_labels: parallel list of labels (1 = spam, 0 = ham).

    Returns:
        (log P(word|spam) array, log P(word|ham) array, P(spam) scalar).
    """
    vocab_size = len(train_matrix[0])
    prior_spam = sum(train_labels) / float(len(train_labels))
    # Add-one smoothed counts per class.
    spam_counts = np.ones(vocab_size)
    ham_counts = np.ones(vocab_size)
    spam_total = 2.0
    ham_total = 2.0
    for vector, label in zip(train_matrix, train_labels):
        if label == 1:
            spam_counts += vector
            spam_total += sum(vector)
        else:
            ham_counts += vector
            ham_total += sum(vector)
    # Log-probabilities avoid floating-point underflow when the
    # per-word terms are summed during classification.
    return (np.log(spam_counts / spam_total),
            np.log(ham_counts / ham_total),
            prior_spam)
# 测试分类器
def test_NB(test_matrix, test_labels, p_word_given_spam, p_word_given_ham, p_spam):
# 使用训练好的模型进行预测
predictions = []
for i in range(len(test_matrix)):
# 计算文档的概率
p_spam_given_document = np.log(p_spam)
p_ham_given_document = np.log(1.0 - p_spam)
for j in range(len(test_matrix[i])):
if test_matrix[i][j] == 1:
p_spam_given_document += p_word_given_spam[j]
p_ham_given_document += p_word_given_ham[j]
# 将概率较大的类别作为预测结果
if p_spam_given_document > p_ham_given_document:
predictions.append(1)
else:
predictions.append(0)
# 计算分类器的准确率
return accuracy_score(test_labels, predictions)
# Main entry point: load data, train, and evaluate.
if __name__ == '__main__':
    import random

    # Load the tokenized emails (spam first, then ham) and their labels.
    documents, labels = load_dataset()
    # Build the vocabulary over the full corpus.
    vocab_list = create_vocab_list(documents)
    # Vectorize every document as a binary bag-of-words vector.
    vectors = [document_to_vector(vocab_list, doc) for doc in documents]
    # BUGFIX: load_dataset returns all spam before all ham, so the
    # original plain 70/30 slice trained almost exclusively on spam and
    # tested almost exclusively on ham, making the reported accuracy
    # meaningless. Shuffle indices (fixed seed for reproducibility)
    # before splitting so both sets contain both classes.
    rng = random.Random(42)
    indices = list(range(len(vectors)))
    rng.shuffle(indices)
    split_index = int(len(indices) * 0.7)
    train_idx = indices[:split_index]
    test_idx = indices[split_index:]
    train_data = [vectors[i] for i in train_idx]
    train_labels = [labels[i] for i in train_idx]
    test_data = [vectors[i] for i in test_idx]
    test_labels = [labels[i] for i in test_idx]
    # Train the naive Bayes classifier.
    p_word_given_spam, p_word_given_ham, p_spam = train_NB(train_data, train_labels)
    # Evaluate on the held-out split.
    accuracy = test_NB(test_data, test_labels, p_word_given_spam, p_word_given_ham, p_spam)
    print('Accuracy:', accuracy)
```
在上面的代码中,我们首先读取所有文件并进行预处理,然后将所有文档转换为向量,并使用朴素贝叶斯算法训练一个分类器。最后,我们使用测试集评估分类器的准确率。
阅读全文