帮我生成一个用朴素贝叶斯方法分类垃圾邮件的代码
时间: 2024-05-05 18:18:44 浏览: 94
当然可以，以下是一个简单的朴素贝叶斯垃圾邮件分类器的 Python 实现：
```python
import os
import math
class NaiveBayesClassifier:
    """Naive Bayes spam/ham text classifier trained from two folders of files.

    Every regular file in ``spam_dir`` / ``ham_dir`` is treated as one
    training message. Messages are tokenised with ``str.split()`` (whitespace
    only; no lower-casing or punctuation stripping), and per-class token
    counts are kept for scoring.
    """

    def __init__(self, spam_dir, ham_dir):
        """Remember the corpus folders and start with empty statistics.

        Args:
            spam_dir: Path of the folder holding spam training messages.
            ham_dir: Path of the folder holding legitimate training messages.
        """
        self.spam_dir = spam_dir
        self.ham_dir = ham_dir
        self.vocabulary = set()    # every distinct token seen in either class
        self.spam_word_count = {}  # token -> occurrences in spam files
        self.ham_word_count = {}   # token -> occurrences in ham files
        self.spam_total_words = 0
        self.ham_total_words = 0
        self.spam_total_files = 0
        self.ham_total_files = 0

    def _count_directory(self, directory, word_count):
        """Tally the tokens of every regular file directly inside *directory*.

        Updates ``word_count`` and ``self.vocabulary`` in place.

        Returns:
            A ``(files_read, tokens_read)`` tuple.
        """
        files_read = 0
        tokens_read = 0
        for filename in os.listdir(directory):
            path = os.path.join(directory, filename)
            # os.listdir also yields sub-directories; opening one would raise,
            # so only regular files count as training messages.
            if not os.path.isfile(path):
                continue
            with open(path, 'r', encoding='utf8', errors='ignore') as file:
                words = file.read().split()
            files_read += 1
            tokens_read += len(words)
            self.vocabulary.update(words)
            for word in words:
                word_count[word] = word_count.get(word, 0) + 1
        return files_read, tokens_read

    def train(self):
        """Read both corpus folders and accumulate the per-class statistics.

        Counts are added to the existing totals, so calling ``train`` again
        accumulates on top of what was already learned.

        Raises:
            OSError: If a corpus folder cannot be listed or a file cannot be
                opened (e.g. ``FileNotFoundError`` for a missing folder).
        """
        spam_files, spam_words = self._count_directory(
            self.spam_dir, self.spam_word_count)
        self.spam_total_files += spam_files
        self.spam_total_words += spam_words

        ham_files, ham_words = self._count_directory(
            self.ham_dir, self.ham_word_count)
        self.ham_total_files += ham_files
        self.ham_total_words += ham_words

    @staticmethod
    def _log_prior(class_files, total_files):
        """Return log(class_files / total_files), or -inf for an empty class."""
        # math.log(0) raises ValueError and 0 / 0 raises ZeroDivisionError;
        # an unseen class simply gets the lowest possible score instead.
        if class_files == 0:
            return -math.inf
        return math.log(class_files / total_files)

    def classify(self, message):
        """Label *message* as ``'spam'`` or ``'ham'``.

        Each class score is the log prior (share of training files) plus the
        sum of log likelihoods of the message tokens, using add-one (Laplace)
        smoothing. Tokens that never appeared in training are ignored.

        A class with no training files scores ``-inf``; ties (including an
        entirely untrained classifier) resolve to ``'ham'``.

        Args:
            message: The text to classify.

        Returns:
            ``'spam'`` if the spam score is strictly higher, else ``'ham'``.
        """
        total_files = self.spam_total_files + self.ham_total_files
        spam_score = self._log_prior(self.spam_total_files, total_files)
        ham_score = self._log_prior(self.ham_total_files, total_files)

        # The smoothing denominators do not depend on the word, so compute
        # them once instead of once per token.
        vocabulary_size = len(self.vocabulary)
        spam_denominator = self.spam_total_words + vocabulary_size
        ham_denominator = self.ham_total_words + vocabulary_size

        for word in message.split():
            if word not in self.vocabulary:
                continue
            spam_score += math.log(
                (self.spam_word_count.get(word, 0) + 1) / spam_denominator)
            ham_score += math.log(
                (self.ham_word_count.get(word, 0) + 1) / ham_denominator)
        return 'spam' if spam_score > ham_score else 'ham'
# Example usage: train on the two corpus folders (replace the placeholder
# paths with real ones), then print the predicted label of each sample.
classifier = NaiveBayesClassifier('path/to/spam/folder', 'path/to/ham/folder')
classifier.train()
for sample in (
    'Hello, this is a spam email',
    'Hello, this is a legitimate email',
):
    print(classifier.classify(sample))
```
以上代码实现了一个简单的朴素贝叶斯垃圾邮件分类器。在训练阶段，它读取垃圾邮件和正常邮件两个文件夹中的所有文件，按空白字符切分单词，并分别统计每个单词在两类邮件中出现的次数。在分类阶段，它先以两类邮件的文件数占比作为先验概率，再对消息中每个出现在词汇表里的单词，累加其经拉普拉斯（加一）平滑后的条件概率的对数（词汇表之外的单词会被忽略），从而得到垃圾邮件得分和正常邮件得分，最后返回得分较高的那个标签；两者相等时返回 'ham'。
阅读全文