单片机语言编写8个led灯闪烁三次
时间: 2023-05-21 11:04:53 浏览: 119
下面给出基于朴素贝叶斯算法、使用 Python 实现垃圾邮件分类的完整代码。
代码如下:
```python
import numpy as np
from collections import Counter
def get_email_text(path):
    """Return the full text of the email file at *path*.

    The file is opened in text mode and decoded as ISO-8859-1, a codec that
    maps every byte value to a code point, so decoding does not fail on
    unexpected bytes.

    Args:
        path: Path of the text file to read.

    Returns:
        The whole file content as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, 'r', encoding='ISO-8859-1') as file:
        # read() yields the same string as joining readlines(), without
        # building the intermediate list or concatenating line by line.
        return file.read()
def spam_lis(email):
    """Split raw email text into lowercase word tokens.

    Every character of the punctuation set below (which includes the
    newline) is treated as a separator, the text is lower-cased, and the
    result is split on whitespace.  The set contains neither ',' nor "'",
    so those two characters stay attached to the tokens they touch.

    Args:
        email: Email text as a str.

    Returns:
        A list of non-empty lowercase str tokens in order of appearance;
        an empty list for empty or separator-only input.
    """
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    # Plain str methods are used on purpose: the np.char.* helpers return an
    # ndarray, which has no .split(), so the text must stay a str throughout.
    # One translate() pass turns every separator into a space.
    separators = str.maketrans(symbols, ' ' * len(symbols))
    # split() without an argument splits on any whitespace run (spaces, tabs,
    # carriage returns) and never produces empty strings.
    return email.translate(separators).lower().split()
def get_stopwords():
    """Load the stop-word list from ``stopwords.txt``.

    The file is looked up relative to the current working directory and read
    as UTF-8, one stop word per line.

    Returns:
        A list with one entry per line of the file, each stripped of
        surrounding whitespace (a blank line yields an empty string).

    Raises:
        OSError: If ``stopwords.txt`` cannot be opened.
    """
    with open('stopwords.txt', 'r', encoding='utf-8') as file:
        # Iterating the file object streams the lines; no readlines() copy.
        return [line.strip() for line in file]
def make_dict(email_paths, spam_or_not, total_emails=None):
    """Build the word-probability table for one class of training emails.

    All emails in *email_paths* are assumed to belong to a single class,
    selected by *spam_or_not*.  Their text is concatenated, tokenized with
    ``spam_lis``, filtered against ``get_stopwords`` and counted.

    Each table entry is a five-element list:

        [p_word_spam, p_word_not_spam, p_spam_word, p_not_spam_word, p_word]

    * ``p_word`` is the word's relative frequency among the kept tokens of
      this corpus (count / total token count).
    * ``p_word_spam`` equals ``p_word`` for a spam corpus and 0.0 otherwise;
      ``p_word_not_spam`` is the mirror image for a ham corpus.
    * ``p_spam_word`` / ``p_not_spam_word`` are 1.0 / 0.0 for a spam corpus
      and 0.0 / 1.0 for a ham corpus: a single call only sees one class, so
      every occurrence it counts belongs to that class.

    Args:
        email_paths: Paths of the emails of one class.
        spam_or_not: True if the emails are spam, False if they are ham.
        total_emails: Size of the whole training set (all classes).  When
            given, the returned prior is ``len(email_paths) / total_emails``.
            When omitted, the supplied emails are taken to be the whole set,
            so the prior is 1.0; two classes built this way end up with
            equal priors.

    Returns:
        A ``(word_dict, prior)`` tuple: the table described above and the
        prior probability of this class (0.0 when there are no emails).
    """
    emails = [get_email_text(email_path) for email_path in email_paths]
    count_email = len(emails)

    # Tokenize the concatenated corpus and drop stop words.  The stop-word
    # list is turned into a set so each membership test is O(1).
    stopwords = set(get_stopwords())
    words = [word for word in spam_lis(' '.join(emails))
             if word not in stopwords]
    count_word = Counter(words)
    total_count = len(words)

    # Class prior: share of the training set made up by this class.  Guarded
    # so an empty path list yields 0.0 instead of a ZeroDivisionError.
    if total_emails is None:
        total_emails = count_email
    p_class = count_email / total_emails if total_emails else 0.0

    # Constant for the whole corpus because only one class is visible here.
    p_spam_word = 1.0 if spam_or_not else 0.0
    p_not_spam_word = 1.0 - p_spam_word

    word_dict = {}
    for word, count in count_word.items():
        # total_count > 0 whenever this loop body runs.
        p_word = count / total_count
        p_word_spam = p_word if spam_or_not else 0.0
        p_word_not_spam = 0.0 if spam_or_not else p_word
        word_dict[word] = [p_word_spam, p_word_not_spam,
                           p_spam_word, p_not_spam_word, p_word]
    return word_dict, p_class
# Paths of the training emails: files 1.txt .. 65.txt in each class folder
spam_paths = list(map('email/spam/{}.txt'.format, range(1, 66)))
ham_paths = list(map('email/ham/{}.txt'.format, range(1, 66)))
all_paths = [*spam_paths, *ham_paths]
# Build the word-probability table and the class prior for each class
spam_dict, p_spam = make_dict(spam_paths, True)
ham_dict, p_ham = make_dict(ham_paths, False)
def classify(email_path, spam_dict, ham_dict, p_spam, p_ham, unseen_prob=1e-6):
    """Score one email as spam or ham with a naive Bayes model.

    The email is read with ``get_email_text``, tokenized with ``spam_lis``
    and filtered against ``get_stopwords``.  For every remaining token that
    occurs in at least one table, the spam likelihood is taken from index 0
    of its ``spam_dict`` entry and the ham likelihood from index 1 of its
    ``ham_dict`` entry.  A likelihood that is missing or not positive is
    replaced by *unseen_prob*, so a word seen in only one class lowers the
    other class's score instead of forcing it to zero.  Tokens absent from
    both tables are ignored.

    Scores are accumulated as sums of logarithms, because a plain product of
    many small probabilities underflows to 0.0 on long emails.

    Args:
        email_path: Path of the email to classify.
        spam_dict: Word table of the spam class; values are sequences whose
            index 0 is P(word | spam).
        ham_dict: Word table of the ham class; values are sequences whose
            index 1 is P(word | ham).
        p_spam: Prior probability of spam.
        p_ham: Prior probability of ham.
        unseen_prob: Likelihood used for a word that the class has no
            positive likelihood for.  Must be greater than 0.

    Returns:
        ``(p_spam_email, p_ham_email)``: the two posterior probabilities,
        normalized to sum to 1.  ``(0.0, 0.0)`` if neither prior is positive.

    Raises:
        ValueError: If *unseen_prob* is not greater than 0.
    """
    import math

    if unseen_prob <= 0:
        raise ValueError('unseen_prob must be greater than 0')
    floor_log = math.log(unseen_prob)

    def safe_log(prob):
        # Log-likelihood with the floor applied to non-positive values.
        return math.log(prob) if prob > 0 else floor_log

    stopwords = set(get_stopwords())
    words = [word for word in spam_lis(get_email_text(email_path))
             if word not in stopwords]

    # A non-positive prior rules the class out entirely.
    log_spam = math.log(p_spam) if p_spam > 0 else -math.inf
    log_ham = math.log(p_ham) if p_ham > 0 else -math.inf

    for word in words:
        spam_entry = spam_dict.get(word)
        ham_entry = ham_dict.get(word)
        if spam_entry is None and ham_entry is None:
            # Word never seen in training: it carries no evidence.
            continue
        log_spam += safe_log(spam_entry[0] if spam_entry is not None else 0.0)
        log_ham += safe_log(ham_entry[1] if ham_entry is not None else 0.0)

    top = max(log_spam, log_ham)
    if top == -math.inf:
        return 0.0, 0.0
    # Subtract the larger log-score before exponentiating so that at least
    # one weight is exactly 1.0 and the normalization cannot underflow.
    weight_spam = math.exp(log_spam - top)
    weight_ham = math.exp(log_ham - top)
    total = weight_spam + weight_ham
    return weight_spam / total, weight_ham / total
# Classify one sample email and print the verdict
email_path = 'email/spam/1.txt'
p_spam_email, p_ham_email = classify(email_path, spam_dict, ham_dict, p_spam, p_ham)
# Ties count as "not spam", because only a strictly larger spam score wins.
print('此邮件是垃圾邮件' if p_spam_email > p_ham_email else '此邮件不是垃圾邮件')
```
希望对你有用!
阅读全文