用 Python 实现朴素贝叶斯文本分类
时间: 2023-10-01 17:09:25 浏览: 219
好的,以下是一个用 Python 编写朴素贝叶斯文本分类器的示例代码:
```python
import os
import re
import math
import random
def tokenize(text):
    """Split ``text`` into tokens on runs of whitespace.

    This is deliberately simple: no lower-casing, punctuation stripping or
    language-aware segmentation is performed.

    Args:
        text: The raw document text.

    Returns:
        A list of whitespace-delimited tokens (empty for blank input).
    """
    tokens = text.split()
    return tokens
def load_documents_from_dir(dir_path, label):
    """Load every regular file directly inside ``dir_path`` as a labelled document.

    Args:
        dir_path: Directory whose files are read as UTF-8 text.
        label: The class label attached to every document from this directory.

    Returns:
        A list of ``(tokens, label)`` tuples, one per file, ordered by file
        name.
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs see the documents in the same order.
    for file_name in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, file_name)
        # os.listdir also yields sub-directories; open() would raise on
        # those, so only regular files are treated as documents.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        documents.append((tokenize(text), label))
    return documents
def load_documents(dirs):
    """Load the documents of several directories, one label per directory.

    Args:
        dirs: An iterable of ``(dir_path, label)`` pairs.

    Returns:
        A single flat list of ``(tokens, label)`` tuples, in the order the
        directories were given.
    """
    return [
        document
        for dir_path, label in dirs
        for document in load_documents_from_dir(dir_path, label)
    ]
def count_words(documents):
    """Tally token occurrences per label across all documents.

    Args:
        documents: An iterable of ``(tokens, label)`` pairs.

    Returns:
        A ``(word_counts, label_word_counts)`` tuple where
        ``word_counts[token][label]`` is how often ``token`` occurs in
        documents of ``label`` and ``label_word_counts[label]`` is the total
        number of tokens seen for ``label``.
    """
    word_counts = {}
    label_word_counts = {}
    for tokens, label in documents:
        # A label is registered even when its document has no tokens.
        label_word_counts[label] = label_word_counts.get(label, 0) + len(tokens)
        for token in tokens:
            per_label = word_counts.setdefault(token, {})
            per_label[label] = per_label.get(label, 0) + 1
    return word_counts, label_word_counts
def train(documents):
    """Train a multinomial naive Bayes classifier.

    Args:
        documents: A sequence of ``(tokens, label)`` pairs.

    Returns:
        A ``(prior_probabilities, conditional_probabilities)`` tuple.
        ``prior_probabilities[label]`` is the fraction of training documents
        carrying ``label``; ``conditional_probabilities[label][word]`` is the
        add-one (Laplace) smoothed probability of ``word`` given ``label``.
        Both are empty dicts when ``documents`` is empty.
    """
    word_counts, label_word_counts = count_words(documents)
    vocabulary_size = len(word_counts)
    total_documents = len(documents)

    # The class prior is a document frequency. Dividing the label's *word*
    # count by the number of documents (as was done before) does not yield a
    # probability and can exceed 1.
    documents_per_label = {}
    for _, label in documents:
        documents_per_label[label] = documents_per_label.get(label, 0) + 1

    prior_probabilities = {}
    conditional_probabilities = {}
    # Iterate in first-seen order (not via a set) so the resulting dicts, and
    # therefore tie-breaking at prediction time, are deterministic.
    for label, label_total in label_word_counts.items():
        prior_probabilities[label] = documents_per_label[label] / total_documents
        # Add-one smoothing: every vocabulary word gets a pseudo-count of 1.
        denominator = label_total + vocabulary_size
        conditional_probabilities[label] = {
            word: (per_label.get(label, 0) + 1) / denominator
            for word, per_label in word_counts.items()
        }
    return prior_probabilities, conditional_probabilities
def predict(tokens, prior_probabilities, conditional_probabilities):
    """Return the most probable label for a tokenized document.

    Scores are accumulated in log space. Tokens that have no entry in a
    label's conditional-probability table are ignored for that label.

    Args:
        tokens: The document's tokens.
        prior_probabilities: Mapping of label to prior probability.
        conditional_probabilities: Mapping of label to a
            ``{word: probability}`` table.

    Returns:
        The label with the highest log score; on ties, the label that comes
        first in ``prior_probabilities``.

    Raises:
        ValueError: If ``prior_probabilities`` is empty.
    """
    def log_score(label):
        total = math.log(prior_probabilities[label])
        for token in tokens:
            label_table = conditional_probabilities[label]
            if token not in label_table:
                continue
            total += math.log(label_table[token])
        return total

    scores = {label: log_score(label) for label in prior_probabilities}
    # max() keeps the first of several equal maxima, i.e. insertion order.
    winner, _ = max(scores.items(), key=lambda entry: entry[1])
    return winner
if __name__ == '__main__':
    # Load the training data: one directory per label.
    train_dirs = [
        ('./pos', 'pos'),
        ('./neg', 'neg')
    ]
    train_documents = load_documents(train_dirs)
    # Train the model.
    prior_probabilities, conditional_probabilities = train(train_documents)
    # Load the held-out test documents.
    test_dirs = [
        ('./pos_test', 'pos'),
        ('./neg_test', 'neg')
    ]
    test_documents = load_documents(test_dirs)
    # Evaluate on a random sample of up to 10 test documents. The size is
    # clamped because random.sample raises ValueError when asked for more
    # items than the population holds.
    sample_size = min(10, len(test_documents))
    test_documents = random.sample(test_documents, sample_size)
    # Predict each sampled document and print the true and predicted labels.
    for tokens, true_label in test_documents:
        predicted_label = predict(tokens, prior_probabilities, conditional_probabilities)
        print('true label:', true_label)
        print('predicted label:', predicted_label)
```
这个示例程序实现了一个简单的朴素贝叶斯文本分类器,使用两个目录(./pos和./neg)中的文本作为训练数据,其中./pos目录下的文本被标记为"pos",./neg目录下的文本被标记为"neg"。
程序首先通过load_documents函数加载所有的训练文本,然后通过train函数训练朴素贝叶斯分类器,最后从./pos_test和./neg_test目录中随机选取10个测试文档进行测试。在每个测试文档预测完标签后,程序会将真实标签和预测标签打印出来。
阅读全文