中文文本分类 Python 代码
时间: 2023-08-01 22:14:23 浏览: 51
以下是一个简单的中文文本分类的 Python 代码示例,使用了朴素贝叶斯分类器:
```python
import jieba
import os
import random
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
# 读取数据集
def read_dataset(folder_path):
    """Load and tokenize every file found under *folder_path*.

    The directory tree is walked recursively. Each file is read as UTF-8,
    segmented with ``jieba.cut`` and the resulting tokens are joined with
    single spaces. The label of a document is the name of the directory
    that directly contains it.

    Args:
        folder_path: Root directory of the dataset.

    Returns:
        A ``(data, labels)`` tuple of two parallel lists: the space-joined
        token strings and the matching directory names.
    """
    data = []
    labels = []
    for root, _dirs, files in os.walk(folder_path):
        # Use basename/normpath rather than splitting on '\\' so the label is
        # the folder name on every platform, not only with Windows separators.
        label = os.path.basename(os.path.normpath(root))
        for file_name in files:
            file_path = os.path.join(root, file_name)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            data.append(' '.join(jieba.cut(content)))
            labels.append(label)
    return data, labels
# 分割数据集
def split_dataset(data, labels, test_ratio=0.2):
    """Randomly split parallel ``data``/``labels`` lists into train and test.

    Indices are shuffled with ``random.shuffle``; the first
    ``int(len(data) * test_ratio)`` shuffled indices form the test set and
    the remainder form the training set. Seed the ``random`` module before
    calling to obtain a reproducible split.

    Args:
        data: Sequence of samples.
        labels: Sequence of labels, one per sample.
        test_ratio: Fraction of samples placed in the test set.

    Returns:
        ``(train_data, train_labels, test_data, test_labels)`` as lists.

    Raises:
        ValueError: If ``data`` and ``labels`` differ in length.
    """
    data_size = len(data)
    if data_size != len(labels):
        # Fail early: a mismatch would otherwise raise IndexError mid-split
        # or silently ignore the surplus labels.
        raise ValueError(
            'data and labels must have the same length '
            '({} != {})'.format(data_size, len(labels))
        )

    order = list(range(data_size))
    random.shuffle(order)

    test_size = int(data_size * test_ratio)
    test_index = order[:test_size]
    train_index = order[test_size:]

    train_data = [data[i] for i in train_index]
    train_labels = [labels[i] for i in train_index]
    test_data = [data[i] for i in test_index]
    test_labels = [labels[i] for i in test_index]
    return train_data, train_labels, test_data, test_labels
# 特征提取
def feature_extraction(train_data, test_data, max_features=3000):
    """Turn tokenized documents into TF-IDF feature matrices.

    The vectorizer is fitted on the training documents only and then applied
    unchanged to the test documents, so both matrices share one vocabulary.

    Args:
        train_data: Training documents as strings.
        test_data: Test documents as strings.
        max_features: Passed to ``TfidfVectorizer(max_features=...)``.

    Returns:
        ``(train_features, test_features)`` as produced by
        ``fit_transform`` and ``transform`` respectively.
    """
    # NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens
    # of two or more word characters, so single-character words coming out of
    # the segmenter are presumably dropped here -- confirm this is intended.
    vectorizer = TfidfVectorizer(max_features=max_features)
    train_features = vectorizer.fit_transform(train_data)
    test_features = vectorizer.transform(test_data)
    return train_features, test_features
# 训练模型
def train_model(train_features, train_labels, alpha=0.01):
    """Fit a multinomial naive Bayes classifier.

    Args:
        train_features: Feature matrix of the training documents.
        train_labels: Labels of the training documents, one per row.
        alpha: Passed to ``MultinomialNB(alpha=...)``.

    Returns:
        The fitted ``MultinomialNB`` instance.
    """
    classifier = MultinomialNB(alpha=alpha)
    classifier.fit(train_features, train_labels)
    return classifier
# 预测
def predict(classifier, test_features):
    """Return ``classifier.predict(test_features)``.

    Args:
        classifier: Any object exposing a ``predict`` method.
        test_features: Features to classify, forwarded unchanged.

    Returns:
        Whatever ``classifier.predict`` returns for ``test_features``.
    """
    return classifier.predict(test_features)
if __name__ == '__main__':
    # Root folder of the dataset; documents are read recursively from it.
    data_folder_path = 'dataset'

    # Load and tokenize the corpus.
    data, labels = read_dataset(data_folder_path)
    if not data:
        # Stop with a clear message instead of an opaque error further down.
        raise SystemExit(
            'No documents found under {!r}'.format(data_folder_path)
        )

    # Random train/test split.
    train_data, train_labels, test_data, test_labels = split_dataset(data, labels)
    if not test_labels:
        # Without test samples the accuracy below would divide by zero.
        raise SystemExit(
            'Test split is empty; add more documents to the dataset.'
        )

    # TF-IDF features (vocabulary learned from the training split).
    train_features, test_features = feature_extraction(train_data, test_data)

    # Train the classifier and label the held-out documents.
    classifier = train_model(train_features, train_labels)
    predict_labels = predict(classifier, test_features)

    # Accuracy = share of test documents whose predicted label is correct.
    correct = sum(
        1
        for predicted, expected in zip(predict_labels, test_labels)
        if predicted == expected
    )
    accuracy = correct / float(len(predict_labels))
    print('Accuracy: {:.2%}'.format(accuracy))
```
数据集的目录结构如下(每个子文件夹对应一个类别,其中的文本文件需为 UTF-8 编码):
```
dataset/
├── class1/
│ ├── file1.txt
│ ├── file2.txt
│ └── ...
├── class2/
│ ├── file1.txt
│ ├── file2.txt
│ └── ...
└── ...
```
代码中,`jieba` 库用于中文分词,`TfidfVectorizer` 用于提取 TF-IDF 特征,`MultinomialNB` 用于训练朴素贝叶斯分类器。
阅读全文