使用朴素贝叶斯算法进行垃圾邮件分类的完整代码
时间: 2023-07-05 13:33:17 浏览: 143
利用贝叶斯算法实现垃圾邮件分类
好的，以下是使用朴素贝叶斯算法进行垃圾邮件分类的完整代码，供您参考。
```python
import os
import random
import math
from collections import defaultdict
# --- Configuration -------------------------------------------------------

# Root folder of the data set; one sub-folder per entry in `categories`.
data_path = 'data'

# Class labels (also used as sub-folder names under `data_path`).
categories = ['ham', 'spam']

# Probability with which a document is assigned to the training set.
train_ratio = 0.8

# Common English words that are dropped during preprocessing.
stopwords = [
    'a', 'an', 'and', 'are', 'as',
    'at', 'be', 'by', 'for', 'from',
    'has', 'he', 'in', 'is', 'it',
    'its', 'of', 'on', 'that', 'the',
    'to', 'was', 'were', 'will', 'with',
]

# --- Data set containers --------------------------------------------------

# Documents as {'category': ..., 'content': ...} dicts.
train_data = []
test_data = []

# --- Model state (filled while training) ---------------------------------

# word -> number of occurrences over all training documents.
bag_of_words = defaultdict(int)

# category -> number of training documents with that label.
category_count = defaultdict(int)

# category -> word -> number of occurrences within that category.
condition_count = defaultdict(lambda: defaultdict(int))
def load_data():
    """Read every labelled e-mail from disk and split it into train/test sets.

    For each name in ``categories`` the folder ``data_path/<category>`` is
    listed and every regular file in it is read as text. Each document is
    stored as ``{'category': ..., 'content': ...}`` and appended to
    ``train_data`` with probability ``train_ratio``, otherwise to
    ``test_data``.

    No random seed is set here, so the split differs between runs unless
    the caller seeds ``random`` beforehand.

    Raises:
        OSError: if a category folder is missing or a file cannot be read.
    """
    for category in categories:
        folder_path = os.path.join(data_path, category)
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            # os.listdir() also returns sub-directories; calling open() on
            # one of those raises, so only regular files count as e-mails.
            if not os.path.isfile(file_path):
                continue
            # errors='ignore' drops undecodable bytes instead of failing on
            # messages that are not valid UTF-8.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
            data = {'category': category, 'content': content}
            if random.random() < train_ratio:
                train_data.append(data)
            else:
                test_data.append(data)
def preprocess_data(data):
    """Turn one raw document into a list of normalised tokens.

    The ``'content'`` text is split on whitespace and each piece is
    lower-cased. A piece is kept only if it consists solely of alphabetic
    characters and is not listed in ``stopwords``; anything carrying digits
    or punctuation is therefore discarded.

    Args:
        data: dict with the keys ``'category'`` and ``'content'``.

    Returns:
        dict with the original ``'category'`` and the token list ``'words'``.
    """
    normalised = (piece.strip().lower() for piece in data['content'].split())
    tokens = [
        token
        for token in normalised
        if token.isalpha() and token not in stopwords
    ]
    return {'category': data['category'], 'words': tokens}
def train():
    """Accumulate the naive Bayes counts from ``train_data``.

    Every training document is preprocessed and then contributes to three
    module-level tables:

    * ``category_count``  - one per document, keyed by its label;
    * ``bag_of_words``    - one per token occurrence, keyed by the token;
    * ``condition_count`` - one per token occurrence, keyed by label and
      then by token.

    The tables are only incremented, never reset, so calling this function
    more than once counts the training documents again.
    """
    for sample in train_data:
        processed = preprocess_data(sample)
        label = processed['category']
        tokens = processed['words']
        category_count[label] += 1
        for token in tokens:
            bag_of_words[token] += 1
            condition_count[label][token] += 1
def predict(data):
    """Classify one document with the trained multinomial naive Bayes model.

    For every category ``c`` the log-score is::

        log P(c) + sum over known words w of log P(w | c)

    where ``P(c)`` is the share of training documents labelled ``c`` and
    ``P(w | c)`` uses add-one (Laplace) smoothing::

        (count of w in c + 1) / (total word count in c + vocabulary size)

    Words that were never seen during training are ignored.

    Args:
        data: dict with the keys ``'category'`` and ``'content'``.

    Returns:
        The entry of ``categories`` with the highest score; on a tie the
        one that comes first in ``categories``.

    Raises:
        ValueError: if ``train_data`` is empty, i.e. there is no model.
    """
    if not train_data:
        raise ValueError("cannot predict: the training set is empty")

    words = preprocess_data(data)['words']
    known_words = [word for word in words if word in bag_of_words]
    vocabulary_size = len(bag_of_words)

    scores = {}
    for category in categories:
        # .get() instead of [] so that prediction never inserts new keys
        # into the defaultdict-based model tables.
        documents = category_count.get(category, 0)
        if documents == 0:
            # A category without training documents has prior 0; log(0)
            # would raise, so it simply can never win.
            scores[category] = float('-inf')
            continue

        score = math.log(documents / len(train_data))
        if known_words:
            word_counts = condition_count.get(category, {})
            # The denominator must depend on the category (its total word
            # count), otherwise the result is not a probability and is
            # biased towards whichever class has more training text.
            log_denominator = math.log(
                sum(word_counts.values()) + vocabulary_size
            )
            for word in known_words:
                score += math.log(word_counts.get(word, 0) + 1) - log_denominator
        scores[category] = score
    return max(scores, key=scores.get)
def evaluate():
    """Print the classification accuracy on ``test_data``.

    Each test document is classified with ``predict`` and compared with its
    stored ``'category'``. The share of correct predictions is printed with
    two decimals. Because the train/test split is random, the test set can
    end up empty for small data sets; in that case a notice is printed
    instead of dividing by zero.
    """
    if not test_data:
        print("Accuracy: n/a (no test data)")
        return
    correct = sum(
        1 for data in test_data if predict(data) == data['category']
    )
    accuracy = correct / len(test_data)
    print(f"Accuracy: {accuracy:.2f}")
if __name__ == '__main__':
    # Run the pipeline in order: read the corpus, fit the counts, report.
    for step in (load_data, train, evaluate):
        step()
```
以上就是使用朴素贝叶斯算法进行垃圾邮件分类的完整代码，代码中包括数据加载、数据预处理、模型训练、预测分类和模型评估等步骤。运行前需要在 data 目录下准备 ham 和 spam 两个子文件夹，分别存放正常邮件和垃圾邮件的文本文件。
阅读全文