# Page metadata (not part of the program): time 2024-02-04 15:02:22, views 94
import math
import os
from collections import Counter

import jieba
# Path to the folder containing spam emails
spam_path = 'spam/'
# Path to the folder containing non-spam (ham) emails
ham_path = 'ham/'
# Path to the stop-word list
stopwords_path = 'stopwords.txt'
# Load the stop words into a set for O(1) membership tests.
# NOTE(review): assumes the file lists one stop word per line — confirm
# against the actual stopwords.txt.
stopwords = set()
with open(stopwords_path, 'r', encoding='utf-8') as f:
    for line in f:
        word = line.strip()
        # Skip blank lines so the empty string never becomes a stop word.
        if word:
            stopwords.add(word)
# Collect every email's text together with its label (1 = spam, 0 = ham).
# Every entry of each folder is opened and read as UTF-8 text.
emails = []
for folder, label in ((spam_path, 1), (ham_path, 0)):
    for filename in os.listdir(folder):
        with open(os.path.join(folder, filename), 'r', encoding='utf-8') as f:
            emails.append((f.read(), label))
# Tokenize the text and drop stop words
def cut_words(content):
    """Segment *content* with jieba and return the tokens that are not stop words.

    Args:
        content: Text to segment; passed unchanged to ``jieba.cut``.

    Returns:
        A list of the tokens produced by ``jieba.cut``, in order, excluding
        every token found in the module-level ``stopwords`` set.
    """
    return [word for word in jieba.cut(content) if word not in stopwords]
# Word-frequency counting
def count_words(words):
    """Return how many times each word occurs in *words*.

    Args:
        words: Any iterable of hashable tokens (a list or a generator).

    Returns:
        A plain ``dict`` mapping each distinct word to its occurrence count,
        with keys in order of first appearance. Empty input yields ``{}``.
    """
    # Counter does the tallying; convert back so callers still get a dict.
    return dict(Counter(words))
# Count how often each word occurs in spam and in ham emails, and how many
# emails of each class there are.
spam_word_count = {}
ham_word_count = {}
spam_total_count = 0
ham_total_count = 0
for content, label in emails:
    words = cut_words(content)
    if label == 1:
        spam_total_count += 1
        for word in words:
            spam_word_count[word] = spam_word_count.get(word, 0) + 1
    else:
        # Only non-spam emails contribute to the ham statistics; without this
        # branch every spam email would be counted as ham as well.
        ham_total_count += 1
        for word in words:
            ham_word_count[word] = ham_word_count.get(word, 0) + 1
# Estimate, for every word seen in either class, a smoothed score of the word
# given spam and given ham: (count + 1) / (number of emails in the class + 2).
# The +1 / +2 keeps every score strictly positive, even for a word that never
# occurred in one of the classes.
# NOTE(review): the numerator counts word occurrences while the denominator
# counts emails, so a score can exceed 1 when a word occurs more often than
# there are emails in that class — confirm which estimator is intended.
p_word_spam = {}
p_word_ham = {}
for word in set(spam_word_count) | set(ham_word_count):
    p_word_spam[word] = (spam_word_count.get(word, 0) + 1) / (spam_total_count + 2)
    p_word_ham[word] = (ham_word_count.get(word, 0) + 1) / (ham_total_count + 2)
# Classification function
def classify(content: str) -> str:
    """Label an email as spam or non-spam from its word scores.

    The text is tokenized with ``cut_words``; for each token the per-class
    scores in ``p_word_spam`` / ``p_word_ham`` are combined and the class with
    the larger combined score wins. No class prior is applied.

    Args:
        content: Raw text of the email to classify.

    Returns:
        ``'垃圾邮件'`` (spam) if the spam score is strictly greater than the
        ham score, otherwise ``'非垃圾邮件'`` (non-spam). Ties, including text
        with no known words, are reported as non-spam.
    """
    # Sum logarithms instead of multiplying raw scores: a product of many
    # values below 1 underflows to 0.0 for long emails, which would make both
    # classes compare equal and always yield the non-spam answer.
    log_spam = 0.0
    log_ham = 0.0
    for word in cut_words(content):
        # Words never seen in training default to 1 (log 1 == 0), so they
        # contribute nothing to either class.
        log_spam += math.log(p_word_spam.get(word, 1))
        log_ham += math.log(p_word_ham.get(word, 1))
    if log_spam > log_ham:
        return '垃圾邮件'
    return '非垃圾邮件'
# Try the classifier on a sample email read from test.txt
with open('test.txt', 'r', encoding='utf-8') as f:
    content = f.read()
# The file is no longer needed once its text is in memory, so classify
# outside the `with` block.
classification = classify(content)