运用朴素贝叶斯分类垃圾邮件,不要用sklearn,nltk
时间: 2023-05-19 12:02:22 浏览: 86
可以使用Python中的numpy和pandas库来实现朴素贝叶斯分类垃圾邮件。具体步骤如下:
1. 读取邮件数据集,将数据集分为训练集和测试集。
2. 对训练集进行预处理,包括分词、去除停用词、统计词频等。
3. 统计每个单词在垃圾邮件和非垃圾邮件中出现的次数,并计算两类邮件的先验概率;分类时用拉普拉斯平滑估计每个单词在两类邮件中出现的条件概率。
4. 对测试集进行分类,计算分类准确率。
以下是代码示例:
```python
import numpy as np
import pandas as pd
# Load the dataset, keeping only columns 'v1' and 'v2' and renaming them
# to 'label' and 'text'.
data = (
    pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

# Train/test split: a seeded 80% sample of the rows is used for training,
# and every row that was not sampled becomes the test set.
train_data = data.sample(frac=0.8, random_state=1)
test_data = data.drop(index=train_data.index)
# Preprocessing
# Stopwords are filtered out before counting. Built once as a frozenset so
# each membership test is O(1) and the collection is not rebuilt per call.
_STOPWORDS = frozenset(
    ['a', 'an', 'the', 'in', 'on', 'at', 'to', 'of', 'for', 'by', 'with']
)


def preprocess(text):
    """Tokenize a message and count its non-stopword tokens.

    The text is lower-cased and split on whitespace. Lower-casing keeps the
    counted tokens consistent with the case-insensitive stopword filter, so
    "Free", "FREE" and "free" are tallied as the same word.

    Args:
        text: The raw message. Anything that is not a ``str`` (for example
            a NaN produced by an empty CSV cell) is treated as an empty
            message.

    Returns:
        A dict mapping each remaining lower-cased token to the number of
        times it occurs in ``text``. Empty when there is nothing to count.
    """
    if not isinstance(text, str):
        return {}

    word_count = {}
    for word in text.lower().split():
        if word in _STOPWORDS:
            continue
        word_count[word] = word_count.get(word, 0) + 1
    return word_count
# Replace every training message with the word-count dict that
# preprocess() returns for it.
train_data['text'] = train_data['text'].apply(preprocess)

# Training statistics.
# spam_words / ham_words map word -> total occurrences within that class;
# spam_count / ham_count are the number of messages seen in each class.
spam_words = {}
ham_words = {}
spam_count = 0
ham_count = 0
for label, counts in zip(train_data['label'], train_data['text']):
    # Any label other than 'spam' is counted as ham.
    if label == 'spam':
        spam_count += 1
        tally = spam_words
    else:
        ham_count += 1
        tally = ham_words
    for word, count in counts.items():
        tally[word] = tally.get(word, 0) + count

# Total number of word occurrences per class.
spam_total_words = sum(spam_words.values())
ham_total_words = sum(ham_words.values())

# Class priors: the fraction of training messages in each class.
spam_prob = spam_count / len(train_data)
ham_prob = ham_count / len(train_data)
# Classification
def classify(text):
    """Label a message as 'spam' or 'ham' using multinomial naive Bayes.

    Each class score is the log prior plus, for every known word in the
    message, ``count * log P(word | class)``. The conditional probabilities
    are Laplace-smoothed: ``(occurrences_in_class + 1) /
    (total_words_in_class + vocabulary_size)``.

    Reads the module-level training statistics ``spam_words``,
    ``ham_words``, ``spam_total_words``, ``ham_total_words``, ``spam_prob``
    and ``ham_prob``.

    Args:
        text: The raw message; it is tokenized and counted by ``preprocess``.

    Returns:
        'spam' when the spam score is strictly greater than the ham score,
        otherwise 'ham' (so ties go to 'ham').
    """
    word_count = preprocess(text)

    # Add-one smoothing needs a single vocabulary shared by both classes;
    # using each class's own vocabulary size would put the two
    # distributions on different event spaces and skew the comparison.
    vocab_size = len(spam_words.keys() | ham_words.keys())
    spam_denominator = spam_total_words + vocab_size
    ham_denominator = ham_total_words + vocab_size

    spam_score = np.log(spam_prob)
    ham_score = np.log(ham_prob)
    for word, count in word_count.items():
        if word not in spam_words and word not in ham_words:
            # A word never seen in training is outside the vocabulary.
            # Scoring it would only penalise the class with the larger
            # corpus, so it is left out of both scores.
            continue
        # Weight by the in-message count so that a repeated word
        # contributes once per occurrence.
        spam_score += count * np.log(
            (spam_words.get(word, 0) + 1) / spam_denominator
        )
        ham_score += count * np.log(
            (ham_words.get(word, 0) + 1) / ham_denominator
        )

    return 'spam' if spam_score > ham_score else 'ham'
# Predict a label for every message in the test set.
test_data['predicted'] = test_data['text'].apply(classify)

# Accuracy: the fraction of test rows whose predicted label equals the
# true label.
accuracy = test_data['label'].eq(test_data['predicted']).mean()
print('Accuracy:', accuracy)
```
注意:这只是一个简单的实现,实际应用中还需要进行更多的优化和改进,例如去除标点符号、使用更完整的停用词表,以及除准确率之外再考察精确率和召回率。
阅读全文