朴素贝叶斯邮件分类代码
时间: 2023-08-15 13:05:30 浏览: 121
下面是一个基于Python的朴素贝叶斯邮件分类代码示例:
```python
import os
import math
import string
import collections
# Stop words that are excluded from the word counts; extend as needed.
stopwords = [
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has',
    'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was',
    'were', 'will', 'with',
]


def get_word_counts(text):
    """Count how often each non-stop-word occurs in *text*.

    ASCII punctuation (``string.punctuation``) is deleted, the text is
    lower-cased and then split on whitespace. Words listed in the
    module-level ``stopwords`` are skipped.

    Args:
        text: The raw document text.

    Returns:
        A ``collections.defaultdict(int)`` mapping each remaining word to
        the number of times it occurs.
    """
    # Delete punctuation in a single pass, then normalise the case.
    cleaned = text.translate(str.maketrans('', '', string.punctuation)).lower()
    # Snapshot the stop words into a set on every call: membership tests
    # become O(1) while later edits to ``stopwords`` are still honoured.
    excluded = set(stopwords)
    word_counts = collections.defaultdict(int)
    for word in cleaned.split():
        if word not in excluded:
            word_counts[word] += 1
    return word_counts
def _count_class_words(directory):
    """Return ``(document count, aggregated word counts)`` for one folder."""
    doc_count = 0
    totals = collections.defaultdict(int)
    for filename in os.listdir(directory):
        if not filename.endswith('.txt'):
            continue
        doc_count += 1
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
            for word, count in get_word_counts(f.read()).items():
                totals[word] += count
    return doc_count, totals


def _smoothed_word_probs(word_counts, vocabulary):
    """Return add-one smoothed P(word | class) for every vocabulary word."""
    denominator = sum(word_counts.values()) + len(vocabulary)
    probs = collections.defaultdict(float)
    for word in vocabulary:
        # ``.get`` avoids inserting missing keys into a defaultdict input.
        probs[word] = (word_counts.get(word, 0) + 1) / denominator
    return probs


def train():
    """Train a naive Bayes spam model from the ``spam`` and ``ham`` folders.

    Every file whose name ends in ``.txt`` directly inside ``spam`` and
    ``ham`` (resolved against the current working directory) is read as
    UTF-8 and tokenised with ``get_word_counts``.

    Word likelihoods use add-one (Laplace) smoothing over the vocabulary of
    both classes combined, so both returned tables contain exactly the same
    words. Without this, a word seen in only one class would be scored for
    that class alone and lower only that class's log score.

    Returns:
        A tuple ``(spam_prob, ham_prob, spam_word_probs, ham_word_probs)``:
        the two class priors (fraction of training files per class) and two
        ``collections.defaultdict(float)`` tables mapping word to
        P(word | class).

    Raises:
        ValueError: If neither folder contains a ``.txt`` file.
    """
    # A single pass per folder yields both the file count and word totals.
    spam_count, spam_word_counts = _count_class_words('spam')
    ham_count, ham_word_counts = _count_class_words('ham')

    total_docs = spam_count + ham_count
    if total_docs == 0:
        raise ValueError("no .txt training files found in 'spam' or 'ham'")
    spam_prob = spam_count / total_docs
    ham_prob = ham_count / total_docs

    vocabulary = set(spam_word_counts) | set(ham_word_counts)
    spam_word_probs = _smoothed_word_probs(spam_word_counts, vocabulary)
    ham_word_probs = _smoothed_word_probs(ham_word_counts, vocabulary)
    return spam_prob, ham_prob, spam_word_probs, ham_word_probs
def _safe_log(probability):
    """Return ``log(probability)``, or ``-inf`` when it is not positive."""
    return math.log(probability) if probability > 0 else float('-inf')


def classify(text, spam_prob, ham_prob, spam_word_probs, ham_word_probs,
             unseen_prob=1e-6):
    """Label *text* as ``'spam'`` or ``'ham'`` by comparing log scores.

    Each class score starts at the log of its prior; every word of *text*
    (tokenised with ``get_word_counts``) then adds
    ``count * log(P(word | class))`` to BOTH scores.

    Args:
        text: The raw message text.
        spam_prob: Prior probability of the spam class.
        ham_prob: Prior probability of the ham class.
        spam_word_probs: Mapping word -> P(word | spam).
        ham_word_probs: Mapping word -> P(word | ham).
        unseen_prob: Fallback likelihood for a word that is present in one
            table but missing from the other.

    Returns:
        ``'spam'`` if the spam score is strictly greater, otherwise
        ``'ham'`` (ties go to ham).
    """
    # A prior of 0 (an empty class) becomes -inf instead of raising.
    spam_score = _safe_log(spam_prob)
    ham_score = _safe_log(ham_prob)
    for word, count in get_word_counts(text).items():
        # Membership tests never insert keys, even for defaultdict tables.
        in_spam = word in spam_word_probs
        in_ham = word in ham_word_probs
        if in_spam and in_ham:
            p_spam = spam_word_probs[word]
            p_ham = ham_word_probs[word]
        elif in_spam:
            p_spam = spam_word_probs[word]
            # Cap the fallback so the class that never saw the word cannot
            # end up with the higher likelihood.
            p_ham = min(unseen_prob, p_spam)
        elif in_ham:
            p_ham = ham_word_probs[word]
            p_spam = min(unseen_prob, p_ham)
        else:
            # Unknown to both classes: carries no evidence either way.
            continue
        # Update both scores together; scoring only the class that knows
        # the word would count its evidence against that class.
        spam_score += count * _safe_log(p_spam)
        ham_score += count * _safe_log(p_ham)
    return 'spam' if spam_score > ham_score else 'ham'
def test(spam_prob, ham_prob, spam_word_probs, ham_word_probs):
    """Print the classifier's accuracy over the ``spam`` and ``ham`` folders.

    Every file whose name ends in ``.txt`` in each folder is read as UTF-8,
    classified with ``classify`` and compared with the folder name, which
    is the expected label.

    NOTE: these are the same folders ``train`` reads, so the printed figure
    is accuracy on the training data, not on held-out messages.

    Args:
        spam_prob: Prior probability of the spam class.
        ham_prob: Prior probability of the ham class.
        spam_word_probs: Mapping word -> P(word | spam).
        ham_word_probs: Mapping word -> P(word | ham).
    """
    correct = 0
    total = 0
    # The folder name doubles as the expected label for its files.
    for label in ('spam', 'ham'):
        for filename in os.listdir(label):
            if not filename.endswith('.txt'):
                continue
            with open(os.path.join(label, filename), 'r', encoding='utf-8') as f:
                text = f.read()
            predicted = classify(text, spam_prob, ham_prob,
                                 spam_word_probs, ham_word_probs)
            if predicted == label:
                correct += 1
            total += 1
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        print('Accuracy: n/a (no .txt files found)')
        return
    print('Accuracy: {:.2%}'.format(correct / total))
if __name__ == '__main__':
    # Fit the model, then report its accuracy.
    spam_prob, ham_prob, spam_word_probs, ham_word_probs = train()
    test(spam_prob, ham_prob, spam_word_probs, ham_word_probs)
```
在这个示例中,我们将垃圾邮件和非垃圾邮件的文本分别保存在两个文件夹 `spam` 和 `ham` 中,每个文件都是一个纯文本文件。`get_word_counts` 函数用于计算单词出现的次数,并去除停用词。`train` 函数用于训练朴素贝叶斯模型,计算垃圾和非垃圾邮件的概率以及单词在垃圾和非垃圾邮件中出现的概率。`classify` 函数用于使用训练好的模型进行分类,根据单词在文本中出现的次数和单词在垃圾和非垃圾邮件中出现的概率计算出垃圾邮件和非垃圾邮件的得分。`test` 函数用于测试分类器的准确率。最后,我们调用 `train` 函数训练模型,并调用 `test` 函数测试分类器的准确率。需要注意的是,`test` 读取的正是训练时使用的 `spam` 和 `ham` 文件夹,因此输出的是训练集上的准确率;要评估模型的泛化能力,应另外准备未参与训练的测试数据。
阅读全文