KNN算法过滤邮件代码
时间: 2023-11-12 07:59:06 浏览: 74
以下是使用KNN算法过滤邮件的Python代码:
```python
import numpy as np
from collections import Counter
import re
def clean_text(text: str) -> str:
    """Lowercase *text* and replace every non a-z character with a space.

    Digits, punctuation, whitespace and any non-ASCII characters are each
    replaced by a single space, so the result has the same length as the
    lowercased input and can be tokenized with ``str.split()``.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text containing only ``a``-``z`` and spaces.
    """
    lowered = text.lower()
    return re.sub(r'[^a-z]', ' ', lowered)
def get_word_counts(text: str) -> Counter:
    """Count how many times each whitespace-separated word occurs in *text*.

    Args:
        text: Text to tokenize; it is split on runs of whitespace.

    Returns:
        A ``Counter`` mapping each word to its number of occurrences. An
        empty or whitespace-only string yields an empty ``Counter``.
    """
    return Counter(text.split())
def get_cosine_similarity(a, b):
    """Return the cosine similarity between two equal-length 1-D vectors.

    Args:
        a: Sequence or array of numbers.
        b: Sequence or array of numbers with the same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``, or ``0.0`` when either vector
        has zero magnitude.

    Raises:
        ValueError: Raised by ``np.dot`` when the two vectors do not have
            the same length.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # Cosine similarity is undefined for a zero vector; the plain formula
    # would divide by zero and yield nan. Treat it as "no similarity".
    if norm_product == 0:
        return 0.0
    return dot_product / norm_product
class KNN:
    """Brute-force k-nearest-neighbours classifier using cosine similarity.

    Every sample is converted to a sparse ``{feature: weight}`` mapping and
    compared against every training sample. The most frequent label among
    the ``k`` most similar training samples is the prediction.

    Args:
        k: Number of neighbours that vote. Must be at least 1.
        featurizer: Optional callable mapping one raw sample to a mapping of
            feature -> numeric weight. When omitted, each sample is treated
            as text and converted with ``get_word_counts(clean_text(sample))``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """

    def __init__(self, k=5, featurizer=None):
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k
        self.featurizer = featurizer

    def fit(self, X, y):
        """Store the training samples ``X`` and their labels ``y``.

        No computation happens here; all work is deferred to ``predict``.
        """
        self.X = X
        self.y = y

    def predict(self, X):
        """Predict a label for every sample in ``X``.

        Args:
            X: Iterable of raw samples (text, unless a custom featurizer
                was supplied).

        Returns:
            A list with one predicted label per sample, in input order.
            Ties in similarity or in the vote are resolved in favour of the
            training sample / label that appears first in the training data.

        Raises:
            ValueError: If the model holds no training samples.
        """
        # Featurize the training set once per call instead of once per
        # (query, training sample) pair.
        reference = [
            (features, self._norm(features), label)
            for features, label in (
                (self._featurize(sample), label)
                for sample, label in zip(self.X, self.y)
            )
        ]
        if not reference:
            raise ValueError("KNN has no training samples; call fit() first")

        y_pred = []
        for sample in X:
            query = self._featurize(sample)
            query_norm = self._norm(query)
            scored = [
                (self._cosine(query, query_norm, features, norm), label)
                for features, norm, label in reference
            ]
            # list.sort is stable, so equally similar neighbours keep their
            # training order.
            scored.sort(key=lambda pair: pair[0], reverse=True)
            votes = Counter(label for _, label in scored[:self.k])
            y_pred.append(votes.most_common(1)[0][0])
        return y_pred

    def _featurize(self, sample):
        """Convert one raw sample into a feature -> weight mapping."""
        if self.featurizer is not None:
            return self.featurizer(sample)
        return get_word_counts(clean_text(sample))

    @staticmethod
    def _norm(features):
        """Return the Euclidean norm of a sparse feature mapping."""
        return sum(weight * weight for weight in features.values()) ** 0.5

    @staticmethod
    def _cosine(a, a_norm, b, b_norm):
        """Return the cosine similarity of two sparse feature mappings.

        The dot product is taken over shared keys, so the two samples are
        compared on a common vocabulary regardless of how many distinct
        features each one has. A zero-norm (empty) sample has similarity 0.
        """
        if a_norm == 0 or b_norm == 0:
            return 0.0
        # Iterate over the smaller mapping; features missing from the other
        # side contribute nothing to the dot product.
        if len(a) > len(b):
            a, b = b, a
        dot = sum(weight * b.get(key, 0) for key, weight in a.items())
        return dot / (a_norm * b_norm)
```
使用示例:
```python
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
# Load the 20 Newsgroups dataset without headers, footers and quoted replies.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
# KNN.predict cleans and tokenizes each sample itself, so it must receive the
# raw document strings. Feeding it (or a Pipeline that starts with another
# TfidfVectorizer) an already vectorized TF-IDF matrix fails, because both
# expect text input.
X = newsgroups.data
y = newsgroups.target
# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Build the KNN classifier and fit it directly on the raw training documents.
# NOTE: this KNN is a brute-force pure-Python implementation that compares
# every test document with every training document; on the full dataset this
# is slow, so consider subsampling when experimenting.
knn = KNN(k=5)
knn.fit(X_train, y_train)
# Predict the test set.
y_pred = knn.predict(X_test)
# Compute the accuracy.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
```
在这个示例中,我们使用20个新闻组数据集进行了测试,使用TF-IDF向量化文本,并将KNN算法封装在Pipeline中。最终,我们得到了一个准确率约为0.54的模型。
阅读全文