帮我用numpy库写一个基于朴素贝叶斯算法的垃圾邮件分类
时间: 2023-05-13 20:07:13 浏览: 165
基于朴素贝叶斯的垃圾邮件分类
5星 · 资源好评率100%
当然可以!以下是一个基于朴素贝叶斯算法的垃圾邮件分类器的示例代码,使用了numpy库:
```python
import numpy as np
class NaiveBayes:
    """Multinomial naive Bayes classifier over tokenized documents.

    Documents are sequences of hashable tokens (typically strings). Word
    likelihoods use add-one (Laplace) smoothing, and all scoring is done in
    log space to avoid floating-point underflow on long documents.
    """

    def __init__(self):
        self._reset()

    def _reset(self):
        """Clear all learned statistics."""
        self.vocab = set()  # every distinct word seen during training
        self.labels = set()  # every distinct class label seen during training
        self.label_word_count = {}  # label -> {word: occurrences under that label}
        self.label_doc_count = {}  # label -> number of training documents
        self.doc_count = 0  # total number of training documents

    def fit(self, X, y):
        """Learn word and document counts from the training data.

        Any statistics from a previous call are discarded first, so the
        class priors always match the word counts they are combined with.

        Args:
            X: Sequence of documents, each an iterable of tokens.
            y: Sequence of labels, one per document in ``X``.

        Raises:
            ValueError: If ``X`` and ``y`` have different lengths.
        """
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )

        self._reset()
        self.doc_count = len(X)
        for doc, label in zip(X, y):
            self.labels.add(label)
            word_counts = self.label_word_count.setdefault(label, {})
            self.label_doc_count[label] = self.label_doc_count.get(label, 0) + 1
            for word in doc:
                self.vocab.add(word)
                word_counts[word] = word_counts.get(word, 0) + 1

    def predict(self, X):
        """Return the most probable label for each document in ``X``.

        Words that never appeared in training are ignored. If the model has
        not been fitted, every prediction is ``None``.

        Args:
            X: Iterable of documents, each an iterable of tokens.

        Returns:
            A list with one predicted label per document.
        """
        vocab_size = len(self.vocab)

        # Everything that depends only on the label is computed once here
        # instead of once per word of every document.
        label_stats = []
        for label in self.labels:
            word_counts = self.label_word_count[label]
            log_prior = np.log(self.label_doc_count[label]) - np.log(self.doc_count)
            denominator = sum(word_counts.values()) + vocab_size
            # The denominator is zero only when the vocabulary is empty, in
            # which case no word is ever scored and the value is unused.
            log_denominator = np.log(denominator) if denominator else 0.0
            label_stats.append((label, log_prior, word_counts, log_denominator))

        y_pred = []
        for doc in X:
            known_words = [word for word in doc if word in self.vocab]
            best_label = None
            best_score = -np.inf
            for label, log_prior, word_counts, log_denominator in label_stats:
                score = log_prior
                for word in known_words:
                    # A word can be in the vocabulary without ever having
                    # occurred under this label; it then counts as zero and
                    # smoothing supplies the "+ 1".
                    score += np.log(word_counts.get(word, 0) + 1) - log_denominator
                if score > best_score:
                    best_score = score
                    best_label = label
            y_pred.append(best_label)
        return y_pred
```
使用方法:
```python
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
# Load the dataset
newsgroups = fetch_20newsgroups(subset='all')
X, y = newsgroups.data, newsgroups.target
# Tokenize
# stopwords.words() already returns a list, which is the type CountVectorizer
# documents for its stop_words parameter, so it is passed through unchanged.
stop_words = stopwords.words('english')
vectorizer = CountVectorizer(stop_words=stop_words)
# Build the analyzer once; rebuilding it for every document repeats the same
# setup work for each of the documents in the corpus.
analyzer = vectorizer.build_analyzer()
X = [analyzer(doc) for doc in X]
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the model
nb = NaiveBayes()
nb.fit(X_train, y_train)
# Predict and compute the accuracy
y_pred = nb.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)
```
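这个示例代码使用了sklearn中的20 Newsgroups(20个新闻组)数据集,借助CountVectorizer的分析器进行分词并去除停用词,然后使用我们实现的朴素贝叶斯分类器进行分类。需要注意:该数据集是多类别的新闻文本数据,这里仅用来演示分类器的用法;若要做垃圾邮件分类,只需把X换成邮件的分词列表、把y换成"垃圾邮件/正常邮件"标签即可。另外,首次使用NLTK停用词表前需要先执行nltk.download('stopwords')。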
阅读全文