用Python编写朴素贝叶斯中文文本分类器,训练集和测试集五五分,并输出精确率、召回率和F1值
时间: 2023-06-16 10:06:08 浏览: 98
python编写朴素贝叶斯用于文本分类
5星 · 资源好评率100%
好的,我可以给你提供一个基于Python实现的朴素贝叶斯中文文本分类器的代码,你可以在此基础上进行训练和测试。
```
import os
import random
from collections import Counter

import jieba
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
# 加载停用词
def load_stopwords(filename):
    """Load a stop-word list from a text file, one word per line.

    Args:
        filename: Path to the stop-word file (UTF-8, with or without a BOM).

    Returns:
        A set of stop words with surrounding whitespace removed. Blank
        lines are ignored so the empty string never ends up in the set.
    """
    # 'utf-8-sig' also reads plain UTF-8, but strips a leading BOM that
    # would otherwise be glued onto the first stop word.
    with open(filename, 'r', encoding='utf-8-sig') as f:
        stripped = (line.strip() for line in f)
        return {word for word in stripped if word}
# 加载数据
def load_data(path):
    """Read every regular file in *path* as one labelled document.

    The label is the part of the file name before the first '.', so a file
    named 'sports.12.txt' is labelled 'sports'.

    Args:
        path: Directory containing the UTF-8 encoded document files.

    Returns:
        A list of (label, content) tuples ordered by file name; content has
        leading and trailing whitespace stripped.
    """
    all_data = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # result reproducible across runs and platforms.
    for filename in sorted(os.listdir(path)):
        full_path = os.path.join(path, filename)
        if not os.path.isfile(full_path):
            # Sub-directories cannot be opened as text documents.
            continue
        label = filename.split('.')[0]
        with open(full_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
        all_data.append((label, content))
    return all_data
# 分词
def cut_words(data, stopwords):
    """Segment every document and return all remaining words as one list.

    Args:
        data: Iterable of (label, content) tuples; the label is ignored.
        stopwords: Collection of words to drop after segmentation.

    Returns:
        A flat list of the words of all documents, in document order,
        with stop words removed.
    """
    all_words = []
    for _label, content in data:
        # Feed the filtered words straight into the result instead of
        # building a temporary per-document list first.
        all_words.extend(
            word for word in jieba.cut(content) if word not in stopwords
        )
    return all_words
# 获取特征
def get_features(data, stopwords, feature_count):
    """Pick the most frequent words in *data* as the feature vocabulary.

    Args:
        data: Iterable of (label, content) tuples; the label is ignored.
        stopwords: Collection of words excluded from counting.
        feature_count: Maximum number of features to return.

    Returns:
        A list of at most *feature_count* words ordered by descending
        frequency over all documents; words with equal counts keep the
        order in which they were first seen.
    """
    word_count = Counter()
    for _label, content in data:
        word_count.update(
            word for word in jieba.cut(content) if word not in stopwords
        )
    # sorted() is stable, so ties stay in first-seen (insertion) order.
    ranked = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
    return [word for word, _count in ranked[:feature_count]]
# 特征向量化
def vectorize(data, features, stopwords):
    """Turn each document into a term-count vector over *features*.

    Args:
        data: Iterable of (label, content) tuples.
        features: Ordered list of feature words; it fixes the vector layout.
        stopwords: Collection of words dropped before counting.

    Returns:
        A list of (label, vector) tuples, where vector[i] is the number of
        times features[i] occurs in the segmented document.
    """
    vectors = []
    for label, content in data:
        # Count every word once instead of rescanning the word list for
        # each feature: list.count() per feature costs
        # O(len(features) * len(words)) per document.
        counts = Counter(
            word for word in jieba.cut(content) if word not in stopwords
        )
        # Counter returns 0 for words it has not seen.
        vectors.append((label, [counts[feature] for feature in features]))
    return vectors
# 训练模型
def train_model(train_data, features):
    """Fit a multinomial naive Bayes classifier on vectorized documents.

    Args:
        train_data: Sequence of (label, vector) tuples.
        features: Unused by the fit; kept so existing callers keep working.

    Returns:
        The fitted MultinomialNB classifier.

    Raises:
        ValueError: If *train_data* is empty.
    """
    if not train_data:
        # Fail with a clear message instead of an opaque error from fit().
        raise ValueError('train_data is empty; cannot fit a classifier')
    X_train = [vector for _label, vector in train_data]
    y_train = [label for label, _vector in train_data]
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    return clf
# 测试模型
def test_model(test_data, clf):
X_test = [vector for label, vector in test_data]
y_test = [label for label, vector in test_data]
y_pred = clf.predict(X_test)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
return precision, recall, f1
def main():
    """Train and evaluate the classifier on a random 50/50 split.

    Reads stop words from 'stopwords.txt' and documents from the 'data'
    directory, then prints precision, recall and F1 for the test half.
    """
    # Load stop words
    stopwords = load_stopwords('stopwords.txt')
    # Load the corpus and split it evenly into training and test halves
    all_data = load_data('data')
    random.shuffle(all_data)
    half = len(all_data) // 2
    train_data = all_data[:half]
    test_data = all_data[half:]
    # Build the vocabulary from the training half only. The whole-corpus
    # segmentation the script used to run here was never used, so it is gone.
    features = get_features(train_data, stopwords, 5000)
    # Vectorize both halves against the same vocabulary
    train_vectors = vectorize(train_data, features, stopwords)
    test_vectors = vectorize(test_data, features, stopwords)
    # Train the model
    clf = train_model(train_vectors, features)
    # Evaluate and report precision, recall and F1
    precision, recall, f1 = test_model(test_vectors, clf)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', f1)


if __name__ == '__main__':
    main()
```
请注意,这是一个简单的中文文本分类器示例代码。如果要将其用于实际应用,还需要对其进行优化,比如使用更好的分词工具、增加特征数量、增加训练数据等等。
阅读全文