时间: 2023-12-03 07:44:39 浏览: 99
5星 · 资源好评率100%
朴素贝叶斯算法是一种常用的文本分类算法,可以用于垃圾邮件分类。在进行垃圾邮件分类时,我们需要对邮件进行中文分词,并使用朴素贝叶斯算法进行分类。同时,为了评价分类效果,我们可以使用准确率、召回率和 F1 值等指标。
下面是使用 Python 实现朴素贝叶斯算法进行垃圾邮件分类的代码,并添加中文分词与评价指标的部分。
首先,我们需要加载数据集并进行中文分词。假设我们的数据集中包含了两个文件夹,一个是 spam 文件夹,一个是 ham 文件夹,分别存放了垃圾邮件和正常邮件。
import os
import jieba
def load_data(folder):
    """Read every file in *folder*, segment it with jieba, and label it.

    Args:
        folder: Directory holding one e-mail per file, encoded as UTF-8. The
            directory's base name (e.g. ``'spam'`` or ``'ham'``) is used as
            the class label.

    Returns:
        A list of ``(text, label)`` tuples, where ``text`` is the e-mail with
        its jieba tokens joined by single spaces.
    """
    # Use the last path component so 'data/spam' or 'spam/' still yields 'spam'.
    label = os.path.basename(os.path.normpath(folder))
    data = []
    # Sorted so repeated runs see the files in the same order.
    for filename in sorted(os.listdir(folder)):
        path = os.path.join(folder, filename)
        if not os.path.isfile(path):
            continue  # skip sub-directories, which open() cannot read
        with open(path, 'r', encoding='utf-8') as f:
            content = f.read()
        tokens = jieba.cut(content)  # Chinese word segmentation
        data.append((' '.join(tokens), label))
    return data
# Load both corpora; each folder name doubles as the class label.
spam_data, ham_data = load_data('spam'), load_data('ham')
接下来,我们需要将数据集划分为训练集和测试集。这里对每封邮件以 0.8 的概率随机分入训练集,否则分入测试集,因此训练集约占 80%,测试集约占 20%(每次运行的实际比例会略有不同)。
import random
def split_data(data, prob):
    """Randomly partition *data* into a training list and a test list.

    Args:
        data: Iterable of samples to split.
        prob: Probability, in ``[0, 1]``, that a sample goes to the training
            list.

    Returns:
        A ``(train_data, test_data)`` tuple of lists. Relative order of the
        samples is preserved inside each list.
    """
    train_data = []
    test_data = []
    for item in data:
        # Each sample is routed independently, so the resulting sizes are
        # only approximately prob / (1 - prob) of the input.
        if random.random() < prob:
            train_data.append(item)
        else:
            test_data.append(item)
    return train_data, test_data
# Split each class independently (about 80% train / 20% test), then merge the
# per-class splits into the final training and test sets.
train_spam_data, test_spam_data = split_data(spam_data, 0.8)
train_ham_data, test_ham_data = split_data(ham_data, 0.8)
train_data = train_spam_data + train_ham_data
test_data = test_spam_data + test_ham_data
def count_words(data):
    """Count word occurrences per class over labelled, pre-segmented text.

    Args:
        data: Iterable of ``(content, label)`` pairs. ``content`` is a string
            of whitespace-separated tokens; ``label`` is ``'spam'`` or
            ``'ham'``.

    Returns:
        A ``(word_count, spam_count, ham_count)`` tuple where ``word_count``
        maps each word to ``{'spam': n, 'ham': m}`` occurrence counts, and
        ``spam_count`` / ``ham_count`` are the total number of tokens seen in
        spam / non-spam samples.

    Raises:
        KeyError: If a sample with at least one token carries a label other
            than ``'spam'`` or ``'ham'``.
    """
    word_count = {}
    spam_count = 0
    ham_count = 0
    for content, label in data:
        words = content.split()
        # The class totals are token totals, so add the whole sample at once.
        if label == 'spam':
            spam_count += len(words)
        else:
            ham_count += len(words)
        for word in words:
            if word not in word_count:
                word_count[word] = {'spam': 0, 'ham': 0}
            word_count[word][label] += 1
    return word_count, spam_count, ham_count
# Gather per-word and per-class token counts from the training split.
word_count, spam_count, ham_count = count_words(train_data)
def word_probabilities(word_count, spam_count, ham_count, k=0.5):
    """Estimate smoothed per-class word likelihoods.

    Uses additive smoothing over the vocabulary:
    ``P(word | class) = (count + k) / (class_total + k * vocabulary_size)``,
    so that the likelihoods of each class sum to 1 across the vocabulary.

    Args:
        word_count: Mapping ``word -> {'spam': n, 'ham': m}`` of occurrence
            counts.
        spam_count: Total number of tokens seen in spam samples.
        ham_count: Total number of tokens seen in non-spam samples.
        k: Smoothing pseudo-count added to every word; must be positive for
            words unseen in a class to get a non-zero probability.

    Returns:
        Mapping ``word -> {'spam': p_spam, 'ham': p_ham}``.
    """
    vocabulary_size = len(word_count)
    # One pseudo-count of k per vocabulary entry goes into each denominator.
    spam_total = spam_count + k * vocabulary_size
    ham_total = ham_count + k * vocabulary_size
    return {
        word: {
            'spam': (count['spam'] + k) / spam_total,
            'ham': (count['ham'] + k) / ham_total,
        }
        for word, count in word_count.items()
    }
# NOTE(review): this rebinds the module-level name `word_probabilities` from
# the function to the dict it returns, so the function cannot be called again
# after this line.
word_probabilities = word_probabilities(word_count, spam_count, ham_count)
现在,我们可以使用上面的计算结果对测试集进行分类,并评价分类效果。这里我们使用精确率(precision)、召回率(recall)、F1 值和准确率(accuracy)这几个指标进行评价。
def classify(content, word_probabilities, spam_prior=0.5):
    """Label pre-segmented text as ``'spam'`` or ``'ham'`` with naive Bayes.

    Scores are accumulated as sums of logarithms rather than products of
    probabilities, so long messages do not underflow to 0.0 and collapse
    into a tie.

    Args:
        content: String of whitespace-separated tokens.
        word_probabilities: Mapping ``word -> {'spam': p, 'ham': q}``. Words
            missing from the mapping are ignored.
        spam_prior: Prior probability that a message is spam.

    Returns:
        ``'spam'`` if the spam score is strictly greater than the ham score,
        otherwise ``'ham'`` (ties go to ``'ham'``).
    """
    import math  # local import keeps this block self-contained

    def _log(p):
        # A zero probability maps to -inf instead of raising ValueError.
        return math.log(p) if p > 0 else float('-inf')

    spam_score = _log(spam_prior)
    ham_score = _log(1 - spam_prior)
    for word in content.split():
        probs = word_probabilities.get(word)
        if probs is None:
            continue
        spam_score += _log(probs['spam'])
        ham_score += _log(probs['ham'])
    return 'spam' if spam_score > ham_score else 'ham'
def evaluate(test_data, word_probabilities):
    """Classify every test sample and compute metrics with spam as positive.

    Args:
        test_data: Sequence of ``(content, label)`` pairs, where ``label`` is
            ``'spam'`` or ``'ham'``.
        word_probabilities: Word likelihood table passed through to
            ``classify``.

    Returns:
        A ``(precision, recall, f1_score, accuracy)`` tuple of floats. A
        metric whose denominator is zero (no predicted spam, no actual spam,
        or an empty test set) is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    true_positive = false_positive = false_negative = true_negative = 0
    for content, label in test_data:
        result = classify(content, word_probabilities)
        if result == 'spam' and label == 'spam':
            true_positive += 1
        elif result == 'spam' and label == 'ham':
            false_positive += 1
        elif result == 'ham' and label == 'spam':
            false_negative += 1
        else:
            # Only the remaining case counts as a true negative.
            true_negative += 1

    predicted_spam = true_positive + false_positive
    actual_spam = true_positive + false_negative
    total = len(test_data)

    precision = true_positive / predicted_spam if predicted_spam else 0.0
    recall = true_positive / actual_spam if actual_spam else 0.0
    if precision + recall:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0.0
    accuracy = (true_positive + true_negative) / total if total else 0.0
    return precision, recall, f1_score, accuracy
# Score the held-out test split with the trained word likelihood table.
precision, recall, f1_score, accuracy = evaluate(test_data, word_probabilities)
# Print each metric on its own line.
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1_score)
print('Accuracy:', accuracy)