朴素贝叶斯垃圾邮件分类的 Python 代码实例
时间: 2023-07-25 16:10:52 浏览: 86
朴素贝叶斯是经典的机器学习算法之一,常用于基于 Python 的垃圾邮件过滤。
当然,以下是一个基于Python的朴素贝叶斯垃圾邮件分类器的示例代码:
```python
import os
import io
import numpy as np
from collections import Counter
# 创建一个函数用于读取文件内容并返回一个单词列表
def get_words_in_file(file):
    """Read a text file and return its whitespace-separated tokens.

    Args:
        file: Path of the file to read.

    Returns:
        A list of strings produced by ``str.split()`` on the file content
        (no lower-casing or punctuation stripping is applied).
    """
    # Mail corpora frequently contain bytes that are not valid UTF-8;
    # dropping them keeps one bad byte from aborting the whole run.
    with io.open(file, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read().split()
# 创建一个函数用于读取文件夹中所有的文件
def get_files_in_folder(folder):
    """Return the paths of all files under ``folder``, searched recursively.

    Args:
        folder: Root directory to walk.

    Returns:
        A sorted list of file paths, each joined with the directory it was
        found in. The list is empty when ``folder`` does not exist or
        contains no files.
    """
    files = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(folder)
        for filename in filenames
    ]
    # os.walk order depends on the file system; sorting makes runs reproducible.
    files.sort()
    return files
# 创建一个函数用于将单词列表转换为一个字典
def get_word_counts(words):
    """Count how many times each word occurs.

    Args:
        words: Iterable of hashable tokens (typically strings).

    Returns:
        A plain ``dict`` mapping each distinct word to its number of
        occurrences; empty when ``words`` is empty.
    """
    return dict(Counter(words))
# 创建一个函数用于计算每个类别(垃圾邮件和非垃圾邮件)中每个单词出现的次数
def calculate_class_word_counts(training_files):
    """Accumulate per-class word counts over the training files.

    A file is labelled ``'ham'`` when the substring ``'ham'`` occurs anywhere
    in its path and ``'spam'`` otherwise.

    Args:
        training_files: Iterable of file paths to read.

    Returns:
        ``{'ham': {word: count}, 'spam': {word: count}}``. Both keys are
        always present, even when one class has no training files, so later
        lookups by class name cannot raise ``KeyError``.
    """
    class_word_counts = {'ham': Counter(), 'spam': Counter()}
    for file in training_files:
        # NOTE: substring labelling also matches e.g. a parent directory
        # named "graham"; keep corpus paths free of stray "ham".
        label = 'ham' if 'ham' in file else 'spam'
        class_word_counts[label].update(get_words_in_file(file))
    # Hand back plain dicts, as callers expect ordinary mappings.
    return {label: dict(counts) for label, counts in class_word_counts.items()}
# 创建一个函数用于计算每个类别(垃圾邮件和非垃圾邮件)中所有单词的总数
def calculate_class_total_words(class_word_counts):
    """Sum the word occurrences of every class.

    Args:
        class_word_counts: Mapping ``{class_name: {word: count}}``.

    Returns:
        A dict mapping each class name to the sum of its counts. ``'ham'``
        and ``'spam'`` are always present (0 when absent from the input);
        any other class name found in the input is included as well.
    """
    total_words = {'ham': 0, 'spam': 0}
    for class_name, words in class_word_counts.items():
        total_words[class_name] = sum(words.values())
    return total_words
# 创建一个函数用于计算每个类别(垃圾邮件和非垃圾邮件)中每个单词的概率
def calculate_class_word_probabilities(class_word_counts, total_words):
    """Turn per-class word counts into relative frequencies.

    Args:
        class_word_counts: Mapping ``{class_name: {word: count}}``.
        total_words: Mapping ``{class_name: total word count}``; must contain
            every class name present in ``class_word_counts``.

    Returns:
        ``{class_name: {word: count / total_words[class_name]}}``. No
        smoothing is applied, so words never seen in a class are simply
        absent from that class's dict.
    """
    class_word_probabilities = {}
    for class_name, words in class_word_counts.items():
        class_total = total_words[class_name]
        # A class with no words yields an empty dict and never divides.
        class_word_probabilities[class_name] = {
            word: count / class_total for word, count in words.items()
        }
    return class_word_probabilities
# 创建一个函数用于预测新邮件的类别
def predict_new_email_class(new_email_file, class_word_probabilities, total_words):
    """Classify one email file as ``'spam'`` or ``'ham'`` with naive Bayes.

    Scores are accumulated in log space: multiplying many small
    probabilities underflows to 0.0 for realistic emails, which made every
    comparison a tie resolved as ``'ham'``. Words seen in only one class get
    add-one (Laplace) smoothing in the other class instead of being skipped,
    since skipping amounts to a probability of 1 there.

    Args:
        new_email_file: Path of the email to classify.
        class_word_probabilities: ``{'spam': {word: p}, 'ham': {word: p}}``
            holding unsmoothed relative frequencies (count / class total).
        total_words: ``{'spam': n, 'ham': n}`` total word counts per class.
            They are used both to undo the normalisation for smoothing and,
            as before, as the class prior.

    Returns:
        ``'spam'`` when the spam score is strictly greater than the ham
        score, otherwise ``'ham'`` (ties, and the case where nothing was
        trained, resolve to ``'ham'``).
    """
    import math  # local import keeps this block self-contained

    word_counts = get_word_counts(get_words_in_file(new_email_file))
    spam_probs = class_word_probabilities.get('spam', {})
    ham_probs = class_word_probabilities.get('ham', {})
    spam_total = total_words.get('spam', 0)
    ham_total = total_words.get('ham', 0)

    grand_total = spam_total + ham_total
    if grand_total == 0:
        # No training data at all: there is no evidence for either class.
        return 'ham'

    vocab_size = len(set(spam_probs) | set(ham_probs))

    def class_log_score(probs, class_total):
        """Return log prior + log likelihood of the email for one class."""
        if class_total == 0:
            # Zero prior: this class can never win.
            return float('-inf')
        score = math.log(class_total / grand_total)
        denominator = class_total + vocab_size
        for word, count in word_counts.items():
            if word not in spam_probs and word not in ham_probs:
                # Out-of-vocabulary words carry no evidence for either class.
                continue
            # probs[word] * class_total recovers the raw training count.
            smoothed = (probs.get(word, 0.0) * class_total + 1) / denominator
            score += count * math.log(smoothed)
        return score

    spam_score = class_log_score(spam_probs, spam_total)
    ham_score = class_log_score(ham_probs, ham_total)
    return 'spam' if spam_score > ham_score else 'ham'
# 创建一个函数用于计算分类器的准确性
def calculate_classifier_accuracy(testing_files, class_word_probabilities, total_words):
    """Return the fraction of test files whose predicted class is correct.

    The true label of a file is derived from its path with the same rule
    used for training: ``'ham'`` when the path contains ``'ham'``, otherwise
    ``'spam'``.

    Args:
        testing_files: Sequence of file paths to classify.
        class_word_probabilities: Per-class word probabilities, passed
            through to ``predict_new_email_class``.
        total_words: Per-class total word counts, passed through to
            ``predict_new_email_class``.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when ``testing_files`` is
        empty (previously a ``ZeroDivisionError``).
    """
    if not testing_files:
        return 0.0
    num_correct = 0
    for file in testing_files:
        actual_class = 'ham' if 'ham' in file else 'spam'
        predicted_class = predict_new_email_class(file, class_word_probabilities, total_words)
        if predicted_class == actual_class:
            num_correct += 1
    return num_correct / len(testing_files)
# Collect the training and testing file paths from the 'training' and
# 'testing' folders (relative to the current working directory).
training_files = get_files_in_folder('training')
testing_files = get_files_in_folder('testing')
# Count how often each word occurs in each class (spam and ham).
class_word_counts = calculate_class_word_counts(training_files)
# Compute the total number of words in each class (spam and ham).
total_words = calculate_class_total_words(class_word_counts)
# Compute the probability of each word within each class (spam and ham).
class_word_probabilities = calculate_class_word_probabilities(class_word_counts, total_words)
# Evaluate the classifier's accuracy on the testing files.
accuracy = calculate_classifier_accuracy(testing_files, class_word_probabilities, total_words)
# Print the classifier's accuracy.
print('Accuracy:', accuracy)
```
在这个代码示例中,我们使用了Python的标准库来读取文件和处理文本数据。我们还使用了Python的Counter类来计算每个单词在文件中出现的次数,并用字典来存储每个类别中每个单词的出现次数和概率。最后,我们使用准确率作为评估分类器的指标。
阅读全文