朴素贝叶斯 Python 代码实例:垃圾邮件分类
时间: 2023-07-25 09:10:51 浏览: 45
好的,下面是一个朴素贝叶斯分类器的 Python 代码实例,用于分类垃圾邮件和非垃圾邮件:
```python
import os
import io
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# 加载数据集
def load_data(path):
    """Read every regular file in *path* and label it as spam or non-spam.

    A file is labelled spam (1) when its name contains the substring
    ``'spm'``; every other file is labelled non-spam (0). Entries are
    visited in sorted name order so repeated runs return the same
    sequence, and anything that is not a regular file (for example a
    sub-directory) is skipped instead of failing on ``open``.

    Args:
        path: Directory holding one email per file.

    Returns:
        A ``(data, targets)`` tuple of two parallel lists: the decoded
        text of each file and its 0/1 label.
    """
    data = []
    targets = []
    # os.listdir makes no ordering promise; sorting keeps any later
    # positional train/test split reproducible across machines.
    for filename in sorted(os.listdir(path)):
        full_path = os.path.join(path, filename)
        if not os.path.isfile(full_path):
            continue
        # Undecodable bytes are dropped rather than aborting the whole load.
        with io.open(full_path, 'r', encoding='utf-8', errors='ignore') as f:
            data.append(f.read())
        targets.append(1 if 'spm' in filename else 0)
    return data, targets
# 将邮件转换为词频向量
def vectorize(data):
    """Turn raw documents into a dense term-count matrix.

    A fresh ``CountVectorizer`` is fitted on *data* each time this is
    called, so the column layout depends on the documents passed in.

    Args:
        data: Iterable of document strings.

    Returns:
        A ``(counts, vocabulary)`` tuple: the dense count array produced
        by ``toarray()`` and the fitted vectorizer's ``vocabulary_``
        mapping.
    """
    count_vectorizer = CountVectorizer()
    term_matrix = count_vectorizer.fit_transform(data)
    dense_counts = term_matrix.toarray()
    return dense_counts, count_vectorizer.vocabulary_
# 训练模型
def train(X_train, y_train):
    """Fit a multinomial naive Bayes classifier and return it.

    Args:
        X_train: Feature matrix passed to ``MultinomialNB.fit``.
        y_train: Labels passed to ``MultinomialNB.fit``.

    Returns:
        The fitted ``MultinomialNB`` instance.
    """
    # fit() returns the estimator itself, so the call can be returned directly.
    return MultinomialNB().fit(X_train, y_train)
# 测试模型
def test(clf, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Args:
        clf: Object exposing ``predict``.
        X_test: Features handed to ``clf.predict``.
        y_test: True labels the predictions are compared against.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)``.
    """
    predictions = clf.predict(X_test)
    # Order here fixes the order of the returned tuple.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)
# 主函数
# Script entry point: load the emails, split 80/20, train, and report metrics.
if __name__ == '__main__':
    # Load every email together with its 0/1 spam label.
    data, targets = load_data('./data')

    # Shuffle with a fixed seed before splitting: a directory listing can
    # group spam and non-spam files together by name, which would leave an
    # ordered 80/20 cut with a badly skewed (or single-class) test set.
    order = np.random.RandomState(42).permutation(len(data))
    data = [data[i] for i in order]
    targets = [targets[i] for i in order]

    # Split the dataset into training and test portions.
    split_point = int(0.8 * len(data))
    if split_point == 0 or split_point == len(data):
        raise SystemExit('Need at least two emails in ./data to train and test.')
    train_docs, test_docs = data[:split_point], data[split_point:]
    # Labels come from load_data; the second value returned by vectorize()
    # is the vocabulary mapping, not the targets.
    y_train, y_test = targets[:split_point], targets[split_point:]

    # Fit the vocabulary on the training documents only, then reuse that
    # exact vocabulary for the test documents so both matrices share the
    # same columns (a separately fitted vectorizer would not line up).
    X_train, vocabulary = vectorize(train_docs)
    X_test = CountVectorizer(vocabulary=vocabulary).transform(test_docs).toarray()

    # Train the model.
    clf = train(X_train, y_train)

    # Evaluate the model.
    accuracy, precision, recall, f1 = test(clf, X_test, y_test)
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 Score:', f1)
```
在这个代码实例中,我们使用了 `CountVectorizer` 将邮件转换为词频向量,然后使用 `MultinomialNB` 训练朴素贝叶斯分类器,并测试模型的准确率、精确率、召回率和 F1 Score。