Implementing spam classification with naive Bayes, without sklearn or nltk, plus 25 training and test samples
Thank you for your question. Code implementing naive Bayes spam classification is shown below:
```python
import os
import re
import math

# Read the contents of one text file.
# errors='ignore' skips any stray bytes that are not valid UTF-8.
def read_file(path):
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()

# Tokenize: extract word-like tokens, lowercased so 'Free' and 'free' match.
def cut_words(content):
    return re.findall(r'\w+', content.lower())

# Count how often each word occurs in a list of tokens.
def calc_word_freq(words):
    freq = {}
    for word in words:
        freq[word] = freq.get(word, 0) + 1
    return freq

# Accumulate per-class word counts. Layout assumption: each immediate
# subdirectory of train_dir is a class, and its files are the samples.
def calc_class_word_freq(train_dir):
    class_word_freq = {}
    for root, dirs, files in os.walk(train_dir):
        for file in files:
            class_name = os.path.basename(root)
            content = read_file(os.path.join(root, file))
            freq = calc_word_freq(cut_words(content))
            class_freq = class_word_freq.setdefault(class_name, {})
            for word, count in freq.items():
                class_freq[word] = class_freq.get(word, 0) + count
    return class_word_freq

# Count the number of documents in each class.
def calc_class_doc_count(train_dir):
    class_doc_count = {}
    for root, dirs, files in os.walk(train_dir):
        for file in files:
            class_name = os.path.basename(root)
            class_doc_count[class_name] = class_doc_count.get(class_name, 0) + 1
    return class_doc_count

# Prior probability of each class: P(c) = docs_in_c / total_docs.
def calc_class_prior_prob(class_doc_count, total_doc_count):
    return {class_name: doc_count / total_doc_count
            for class_name, doc_count in class_doc_count.items()}

# Conditional probability of every vocabulary word given each class, with
# Laplace smoothing: P(w|c) = (count(w,c) + alpha) / (count(c) + alpha * |V|).
# Iterating over the full vocabulary (not only words seen in the class)
# ensures predict() never hits a missing key for a word another class saw.
def calc_class_cond_prob(class_word_freq, class_word_count, vocab, alpha=1.0):
    vocab_size = len(vocab)
    class_cond_prob = {}
    for class_name, word_freq in class_word_freq.items():
        denom = class_word_count[class_name] + alpha * vocab_size
        class_cond_prob[class_name] = {
            word: (word_freq.get(word, 0) + alpha) / denom for word in vocab
        }
    return class_cond_prob

# Train the model: estimate priors and smoothed conditional probabilities.
def train(train_dir):
    class_word_freq = calc_class_word_freq(train_dir)
    class_doc_count = calc_class_doc_count(train_dir)
    total_doc_count = sum(class_doc_count.values())
    class_prior_prob = calc_class_prior_prob(class_doc_count, total_doc_count)
    vocab = set()
    for word_freq in class_word_freq.values():
        vocab |= set(word_freq.keys())
    class_word_count = {class_name: sum(word_freq.values())
                        for class_name, word_freq in class_word_freq.items()}
    class_cond_prob = calc_class_cond_prob(class_word_freq, class_word_count, vocab)
    return class_prior_prob, class_cond_prob, vocab

# Predict the class with the highest posterior, summing log-probabilities
# to avoid floating-point underflow on long messages.
def predict(content, class_prior_prob, class_cond_prob, vocab):
    words = cut_words(content)
    score = {}
    for class_name, prior_prob in class_prior_prob.items():
        score[class_name] = math.log(prior_prob)
        for word in words:
            if word not in vocab:
                continue  # ignore words never seen during training
            score[class_name] += math.log(class_cond_prob[class_name][word])
    return max(score, key=score.get)

# Evaluate accuracy on a test directory with the same layout as train_dir.
def test(test_dir, class_prior_prob, class_cond_prob, vocab):
    correct_count = 0
    total_count = 0
    for root, dirs, files in os.walk(test_dir):
        for file in files:
            class_name = os.path.basename(root)
            content = read_file(os.path.join(root, file))
            pred = predict(content, class_prior_prob, class_cond_prob, vocab)
            if pred == class_name:
                correct_count += 1
            total_count += 1
    return correct_count / total_count

def main(train_dir, test_dir):
    class_prior_prob, class_cond_prob, vocab = train(train_dir)
    accuracy = test(test_dir, class_prior_prob, class_cond_prob, vocab)
    print('Accuracy:', accuracy)

if __name__ == '__main__':
    train_dir = 'train'
    test_dir = 'test'
    main(train_dir, test_dir)
```
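As a quick usage sketch, assuming the functions above are in scope and `train/` contains class subdirectories such as `spam/` and `ham/` (directory names of my choosing, not mandated by the code):

```python
# Train on the 'train' directory, then classify one raw message string.
prior, cond, vocab = train('train')
print(predict('win a free prize now', prior, cond, vocab))  # e.g. 'spam'
```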
Here, `train_dir` and `test_dir` are the directories holding the training and test sets. Under each one, every subdirectory name is a class label, and the files inside it are that class's samples. The code uses Laplace smoothing: each word count is incremented by a constant `alpha`, so no conditional probability is ever exactly 0. At prediction time the class with the maximum posterior probability is chosen, computed as the log-prior plus summed log-likelihoods. Finally, the script prints the classification accuracy on the test set.
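To make the smoothing concrete, here is the formula P(w|c) = (count(w,c) + alpha) / (count(c) + alpha * |V|) evaluated on made-up numbers:

```python
# Hypothetical counts: 'prize' occurs 3 times in the spam class, spam has
# 40 word tokens in total, and the training vocabulary has 100 words.
alpha = 1.0
p = (3 + alpha) / (40 + alpha * 100)  # (3 + 1) / (40 + 100)
print(p)  # 0.02857... -- and a word with count 0 still gets 1/140, not 0
```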
As for providing the 25 training and test samples: as an AI I have no file system and cannot ship data files, but you can prepare such a corpus yourself, place it under the corresponding directories, and then run the code above to train and evaluate; see the generator sketched below.
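If you just need something to run against, this sketch fabricates a toy corpus in the expected layout, with 25 files per class for training and 25 per class for testing. The `spam`/`ham` labels and the phrase pools are placeholder assumptions, not real email data:

```python
import os
import random

# Placeholder phrase pools -- swap in real emails for meaningful results.
SPAM = ['win a free prize now', 'cheap meds limited offer',
        'claim your lottery reward']
HAM = ['meeting moved to friday', 'please review the attached report',
       'see you at lunch tomorrow']

def make_split(base_dir, n=25, seed=0):
    rng = random.Random(seed)
    for label, pool in (('spam', SPAM), ('ham', HAM)):
        class_dir = os.path.join(base_dir, label)
        os.makedirs(class_dir, exist_ok=True)
        for i in range(n):
            # Each sample is a few random phrases joined into one message.
            text = ' '.join(rng.choice(pool) for _ in range(3))
            with open(os.path.join(class_dir, f'{i}.txt'), 'w',
                      encoding='utf-8') as f:
                f.write(text)

make_split('train', n=25, seed=1)
make_split('test', n=25, seed=2)
```

After running it, `main('train', 'test')` from the classifier above should report an accuracy near 1.0, since these toy classes barely overlap.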