单片机语言编写8个led灯闪烁三次

"""Naive Bayes spam e-mail classifier (complete Python implementation).

Per-class word statistics are built from two sets of plain-text e-mails
(spam and non-spam, "ham").  A message is then scored under both classes
and assigned to the more probable one.
"""
import math
from collections import Counter
from typing import Dict, Iterable, List, Optional, Tuple

# Characters replaced by a space before text is split into words.
# The apostrophe is not in the set, so contractions such as "don't" stay whole.
_SEPARATORS = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\n"
_SEPARATOR_TABLE = str.maketrans(_SEPARATORS, " " * len(_SEPARATORS))

# Likelihood used for a word that one class's corpus never contained, so that
# a single unseen word cannot drive that class's score to zero.
_UNSEEN_WORD_PROB = 1e-6


def get_email_text(path: str) -> str:
    """Return the full text of the e-mail stored at *path*.

    The file is decoded as ISO-8859-1, which maps every byte to a character
    and therefore never raises a decoding error.
    """
    with open(path, 'r', encoding='ISO-8859-1') as file:
        return file.read()


def spam_lis(email: str) -> List[str]:
    """Split *email* into lower-case word tokens.

    Every character in ``_SEPARATORS`` is turned into a space, the text is
    lower-cased, and the result is split on runs of whitespace, so the
    returned list never contains empty strings.
    """
    return email.translate(_SEPARATOR_TABLE).lower().split()


def get_stopwords() -> List[str]:
    """Return the stop words listed one per line in ``stopwords.txt``.

    The file is looked up relative to the current working directory and is
    read as UTF-8; surrounding whitespace is stripped from every line.
    """
    with open('stopwords.txt', 'r', encoding='utf-8') as file:
        return [line.strip() for line in file]


def make_dict(
    email_paths: Iterable[str],
    spam_or_not: bool,
    total_emails: Optional[int] = None,
) -> Tuple[Dict[str, List[float]], float]:
    """Build the word statistics for one class of e-mails.

    Args:
        email_paths: Paths of the e-mails that all belong to one class.
        spam_or_not: True when the e-mails are spam, False when they are ham.
        total_emails: Number of e-mails in the whole training set (both
            classes).  When omitted, the size of this corpus is used, which
            yields a prior of 1.0; doing so for both classes is equivalent
            to assuming equal priors.

    Returns:
        A ``(word_dict, prior)`` tuple.  ``prior`` is the share of
        ``total_emails`` that this corpus represents (0.0 when there are no
        e-mails).  ``word_dict`` maps each non-stop-word to a five-element
        list ``[p_word_spam, p_word_not_spam, p_spam_word, p_not_spam_word,
        p_word]``:

        * ``p_word`` - relative frequency of the word in this corpus.
        * ``p_word_spam`` / ``p_word_not_spam`` - that frequency in the slot
          of the corpus's own class, 0.0 in the slot of the other class.
        * ``p_spam_word`` / ``p_not_spam_word`` - share of the word's
          occurrences in this corpus that are spam / ham; the corpus holds
          a single class, so these are 1.0 and 0.0 (or the reverse).
    """
    emails = [get_email_text(email_path) for email_path in email_paths]

    # A set makes every stop-word test O(1) instead of a list scan.
    stopwords = set(get_stopwords())
    words = [word for word in spam_lis(' '.join(emails)) if word not in stopwords]
    count_word = Counter(words)
    total_count = len(words)

    denominator = len(emails) if total_emails is None else total_emails
    prior = len(emails) / denominator if denominator else 0.0

    p_spam_word = 1.0 if spam_or_not else 0.0
    p_not_spam_word = 1.0 - p_spam_word

    word_dict = {}
    for word, count in count_word.items():
        # total_count is at least 1 whenever this loop body runs.
        p_word = count / total_count
        p_word_spam = p_word if spam_or_not else 0.0
        p_word_not_spam = 0.0 if spam_or_not else p_word
        word_dict[word] = [
            p_word_spam,
            p_word_not_spam,
            p_spam_word,
            p_not_spam_word,
            p_word,
        ]
    return word_dict, prior


def _log_or_neg_inf(probability: float) -> float:
    """Return ``log(probability)``, or ``-inf`` for a non-positive value."""
    return math.log(probability) if probability > 0 else float('-inf')


def _posteriors(log_spam: float, log_ham: float) -> Tuple[float, float]:
    """Turn two log scores into probabilities that sum to 1."""
    peak = max(log_spam, log_ham)
    if peak == float('-inf'):
        # Both priors were zero: neither class is possible.
        return 0.0, 0.0
    # Subtracting the larger score first keeps exp() from underflowing.
    spam_weight = math.exp(log_spam - peak)
    ham_weight = math.exp(log_ham - peak)
    total = spam_weight + ham_weight
    return spam_weight / total, ham_weight / total


def classify(
    email_path: str,
    spam_dict: Dict[str, List[float]],
    ham_dict: Dict[str, List[float]],
    p_spam: float,
    p_ham: float,
) -> Tuple[float, float]:
    """Score the e-mail at *email_path* as spam and as ham.

    Args:
        email_path: Path of the e-mail to classify.
        spam_dict: Word statistics returned by ``make_dict(..., True)``.
        ham_dict: Word statistics returned by ``make_dict(..., False)``.
        p_spam: Prior probability of spam.
        p_ham: Prior probability of ham.

    Returns:
        ``(p_spam_email, p_ham_email)``: the posterior probabilities of the
        two classes, normalised to sum to 1 (both 0.0 when both priors are
        0).  The e-mail is spam when the first value is the larger one.
    """
    stopwords = set(get_stopwords())
    words = [
        word for word in spam_lis(get_email_text(email_path))
        if word not in stopwords
    ]

    # Sum logarithms instead of multiplying probabilities: the product of
    # many small factors underflows to 0.0 on long messages.
    log_spam = _log_or_neg_inf(p_spam)
    log_ham = _log_or_neg_inf(p_ham)
    for word in words:
        spam_entry = spam_dict.get(word)
        ham_entry = ham_dict.get(word)
        if spam_entry is None and ham_entry is None:
            # A word seen in neither corpus carries no evidence; ignore it.
            continue
        # Each class's likelihood comes from its own dictionary.
        spam_likelihood = spam_entry[0] if spam_entry is not None else 0.0
        ham_likelihood = ham_entry[1] if ham_entry is not None else 0.0
        log_spam += math.log(
            spam_likelihood if spam_likelihood > 0 else _UNSEEN_WORD_PROB
        )
        log_ham += math.log(
            ham_likelihood if ham_likelihood > 0 else _UNSEEN_WORD_PROB
        )
    return _posteriors(log_spam, log_ham)


if __name__ == "__main__":
    # Locations of the training e-mails.
    spam_paths = ['email/spam/{}.txt'.format(i) for i in range(1, 66)]
    ham_paths = ['email/ham/{}.txt'.format(i) for i in range(1, 66)]
    all_paths = spam_paths + ham_paths

    # Train one dictionary per class; the priors use the combined set size.
    spam_dict, p_spam = make_dict(spam_paths, True, total_emails=len(all_paths))
    ham_dict, p_ham = make_dict(ham_paths, False, total_emails=len(all_paths))

    # Check whether one e-mail is spam.
    email_path = 'email/spam/1.txt'
    p_spam_email, p_ham_email = classify(
        email_path, spam_dict, ham_dict, p_spam, p_ham
    )
    if p_spam_email > p_ham_email:
        print('此邮件是垃圾邮件')
    else:
        print('此邮件不是垃圾邮件')

阅读全文

CSDN会员

开通CSDN年卡参与万元壕礼抽奖

海量 VIP 免费资源 · 千本正版电子书 · 商城会员专享价 · 千门课程 & 专栏

全年可省 5,000 元 · 立即开通

最新推荐

单片机语言编写8个led灯闪烁三次

最新推荐

毕设和企业适用springboot企业健康管理平台类及活动管理平台源码+论文+视频.zip

GitHub图片浏览插件：直观展示代码中的图像

管理建模和仿真的文件

【OPPO手机故障诊断专家】：工程指令快速定位与解决

求[100，900]之间相差为12的素数对（注：要求素数对的两个素数均在该范围内）的个数

Android IPTV项目：直播频道的实时流媒体实现

"互动学习：行动中的多样性与论文攻读经历"

【OPPO手机工程模式终极指南】：掌握这些秘籍，故障排查不再难！

前端在json文件里写模板，可以换行 有空格现在在文本框的时候

机器学习在医院再入院率预测中的应用分析

前端在json文件里写模板，可以换行有空格现在在文本框的时候