朴素贝叶斯垃圾邮件文档识别代码
时间: 2023-09-28 18:13:40 浏览: 28
以下是一个用 Python 实现的简单朴素贝叶斯垃圾邮件识别示例:
```python
import math
import os
from collections import Counter
# Count word frequencies in one document
def count_words(file_path):
    """Return a mapping of each whitespace-separated token to its frequency.

    The file is decoded as UTF-8 and undecodable bytes are dropped, so the
    result does not depend on the platform's default encoding and a stray
    byte in a mail body cannot abort a whole training run.

    Args:
        file_path: Path of the text file to tokenize.

    Returns:
        A plain ``dict`` mapping token -> number of occurrences. An empty
        (or whitespace-only) file yields an empty dict.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        words = f.read().split()
    # Counter does the tallying; convert back so callers keep getting a dict.
    return dict(Counter(words))
# Estimate per-class word probabilities and class priors from a labelled corpus
def train(train_dir):
    """Fit a multinomial naive Bayes model on the files under ``train_dir``.

    Every file found under ``train_dir`` (recursively) is one training
    document. Its label is taken from the first path component below
    ``train_dir``: the file's own name when it sits directly in
    ``train_dir``, or the name of its top-level sub-folder otherwise. A
    component containing "ham" marks normal mail; anything else is treated
    as spam.

    Word probabilities use add-one (Laplace) smoothing over the vocabulary
    shared by both classes, so both returned tables contain every word seen
    anywhere in the training data.

    Args:
        train_dir: Directory holding the labelled training documents.

    Returns:
        A tuple ``(normal_word_prob, spam_word_prob, normal_prob, spam_prob)``
        where the first two map word -> P(word | class) and the last two are
        the class priors.

    Raises:
        ValueError: If no training file is found under ``train_dir``.
    """
    normal_count = 0
    spam_count = 0
    normal_word_count = {}
    spam_word_count = {}

    # Single pass: count documents and accumulate word totals per class.
    for dir_path, _dir_names, file_names in os.walk(train_dir):
        for file_name in file_names:
            file_path = os.path.join(dir_path, file_name)
            label_source = os.path.relpath(file_path, train_dir).split(os.sep)[0]
            if "ham" in label_source:
                normal_count += 1
                class_totals = normal_word_count
            else:
                spam_count += 1
                class_totals = spam_word_count
            for word, count in count_words(file_path).items():
                class_totals[word] = class_totals.get(word, 0) + count

    total_docs = normal_count + spam_count
    if total_docs == 0:
        raise ValueError("no training files found under %r" % (train_dir,))

    # Smooth over the shared vocabulary so a word seen in only one class
    # still gets a small non-zero probability in the other.
    vocabulary = set(normal_word_count) | set(spam_word_count)
    vocab_size = len(vocabulary)
    # Denominators are loop-invariant; compute them once per class.
    normal_denominator = sum(normal_word_count.values()) + vocab_size
    spam_denominator = sum(spam_word_count.values()) + vocab_size

    normal_word_prob = {
        word: (normal_word_count.get(word, 0) + 1) / normal_denominator
        for word in vocabulary
    }
    spam_word_prob = {
        word: (spam_word_count.get(word, 0) + 1) / spam_denominator
        for word in vocabulary
    }

    # Class priors are the fraction of documents carrying each label.
    normal_prob = normal_count / total_docs
    spam_prob = spam_count / total_docs
    return normal_word_prob, spam_word_prob, normal_prob, spam_prob
# Score one class for a document given its word-probability table and prior
def _class_log_score(word_count, word_prob, prior):
    """Return the log-score of one class for the given word counts."""
    if prior <= 0:
        # math.log(0) would raise; a class with zero prior can never win.
        return float("-inf")
    if word_prob:
        # Fallback for words missing from the table. The table holds
        # probabilities rather than counts, so this only approximates
        # add-one smoothing. It is loop-invariant, so compute it once.
        unseen_log_prob = math.log(1 / (sum(word_prob.values()) + len(word_prob)))
    else:
        # An empty table carries no word evidence for this class.
        unseen_log_prob = 0.0
    score = math.log(prior)
    for word, count in word_count.items():
        if word in word_prob:
            score += math.log(word_prob[word]) * count
        else:
            score += unseen_log_prob * count
    return score


# Classify one document
def predict(file_path, normal_word_prob, spam_word_prob, normal_prob, spam_prob):
    """Classify the document at ``file_path`` as "spam" or "ham".

    Each class is scored as log(prior) plus the sum over the document's
    words of count * log P(word | class). Log-space is used so that long
    documents do not underflow.

    Args:
        file_path: Path of the document to classify.
        normal_word_prob: Mapping word -> P(word | normal mail).
        spam_word_prob: Mapping word -> P(word | spam).
        normal_prob: Prior probability of normal mail.
        spam_prob: Prior probability of spam.

    Returns:
        "spam" if the spam score is strictly greater than the normal score,
        otherwise "ham" (ties resolve to "ham").
    """
    word_count = count_words(file_path)
    normal_score = _class_log_score(word_count, normal_word_prob, normal_prob)
    spam_score = _class_log_score(word_count, spam_word_prob, spam_prob)
    return "spam" if spam_score > normal_score else "ham"
```
使用方法:
```python
# Fit the model on the labelled training files found in train_dir.
train_dir = "path/to/train/dir"
normal_word_prob, spam_word_prob, normal_prob, spam_prob = train(train_dir)
# Classify a single message; predict() returns either "spam" or "ham".
file_path = "path/to/test/file"
result = predict(file_path, normal_word_prob, spam_word_prob, normal_prob, spam_prob)
print(result)
```
其中,`train_dir`是训练集文件夹的路径,训练邮件应直接存放在该文件夹中:文件名中包含"ham"的文件被视为正常邮件,其余文件均被视为垃圾邮件。`file_path`是测试文件的路径,`result`为预测结果,为"ham"或"spam"。
相关推荐
![rar](https://img-home.csdnimg.cn/images/20210720083606.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![zip](https://img-home.csdnimg.cn/images/20210720083736.png)
![zip](https://img-home.csdnimg.cn/images/20210720083736.png)