写出基于TF-IDF+KNN进行txt文件格式的中文文本分类的代码
时间: 2023-12-14 11:36:26 浏览: 79
以下是基于TF-IDF和KNN的中文文本分类代码示例:
```python
import jieba
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
def load_data(path):
    """Load a folder-per-class text corpus.

    ``path`` must contain one sub-directory per class. Every regular file
    inside a class directory is read as one UTF-8 document and labelled with
    the name of that directory.

    Args:
        path: Root directory of one dataset split (e.g. ``data/train``).

    Returns:
        A ``(texts, labels)`` tuple of equally long lists where ``labels[i]``
        is the class-directory name of ``texts[i]``.

    Raises:
        OSError: If ``path`` or a file below it cannot be read.
        UnicodeDecodeError: If a document is not valid UTF-8.
    """
    texts = []
    labels = []
    # os.listdir() order is arbitrary; sorting keeps the sample order
    # reproducible across runs and platforms.
    for label in sorted(os.listdir(path)):
        label_path = os.path.join(path, label)
        # Stray files next to the class folders (README, .DS_Store, ...)
        # are not classes and must not be listed as directories.
        if not os.path.isdir(label_path):
            continue
        for file_name in sorted(os.listdir(label_path)):
            file_path = os.path.join(label_path, file_name)
            # Nested directories cannot be opened as documents.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels
def cut_words(texts):
    """Segment every document with jieba and join its tokens with spaces.

    Args:
        texts: Iterable of raw document strings.

    Returns:
        A list holding one space-delimited token string per input document,
        in input order.
    """
    return [' '.join(jieba.cut(document)) for document in texts]
def build_tfidf_vectorizer(texts):
    """Fit a TF-IDF vectorizer on pre-segmented documents.

    Args:
        texts: Iterable of documents whose tokens are separated by
            whitespace (the format produced by ``cut_words``).

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    # The default token_pattern keeps only tokens of two or more word
    # characters, which silently discards every single-character Chinese
    # word produced by the segmenter. Accept one-character tokens as well.
    tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    # fit() learns the vocabulary and IDF weights without also building a
    # document-term matrix that would be thrown away immediately.
    tfidf_vectorizer.fit(texts)
    return tfidf_vectorizer
def compute_similarity(tfidf_vectorizer, texts, reference_texts=None):
    """Return pairwise TF-IDF dot-product similarities as a dense array.

    When the vectorizer L2-normalises its rows (the ``TfidfVectorizer``
    default), the dot product equals cosine similarity.

    Args:
        tfidf_vectorizer: Fitted vectorizer exposing ``transform``.
        texts: Segmented documents that form the rows of the result.
        reference_texts: Optional segmented documents that form the columns.
            When omitted, ``texts`` is compared with itself.

    Returns:
        Dense array of shape ``(len(texts), len(reference_texts))`` (or
        ``(len(texts), len(texts))`` without ``reference_texts``) whose entry
        ``[i, j]`` is the dot product of the two TF-IDF vectors.
    """
    tfidf_matrix = tfidf_vectorizer.transform(texts)
    if reference_texts is None:
        reference_matrix = tfidf_matrix
    else:
        reference_matrix = tfidf_vectorizer.transform(reference_texts)
    # ``@`` always means matrix multiplication; ``*`` does so only for the
    # legacy scipy sparse *matrix* classes and is element-wise for sparse
    # arrays, which would break or give wrong results here.
    similarity_matrix = tfidf_matrix @ reference_matrix.T
    return similarity_matrix.toarray()
def text_classification(train_texts, train_labels, test_texts, n_neighbors=5):
    """Classify documents with TF-IDF features and cosine-distance KNN.

    Args:
        train_texts: Raw training documents.
        train_labels: Class label of each training document, aligned with
            ``train_texts``.
        test_texts: Raw documents to classify.
        n_neighbors: Number of nearest training documents that vote on each
            prediction; must not exceed ``len(train_texts)``.

    Returns:
        Array with the predicted label of every document in ``test_texts``,
        in input order.
    """
    # Segment both splits into whitespace-delimited tokens.
    train_cut_texts = cut_words(train_texts)
    test_cut_texts = cut_words(test_texts)

    # Vocabulary and IDF weights come from the training split only, so no
    # information leaks from the test documents into the features.
    tfidf_vectorizer = build_tfidf_vectorizer(train_cut_texts)
    train_matrix = tfidf_vectorizer.transform(train_cut_texts)
    test_matrix = tfidf_vectorizer.transform(test_cut_texts)

    # Hand the feature matrices to KNN and let it compute cosine distances
    # itself. metric='precomputed' would require an (n_test, n_train)
    # *distance* matrix; passing similarities makes the least similar
    # documents look like the nearest neighbours, and a test-vs-test matrix
    # has the wrong shape altogether. Brute force is the search strategy
    # that supports sparse input with the cosine metric.
    knn_model = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        metric='cosine',
        algorithm='brute',
    )
    knn_model.fit(train_matrix, train_labels)
    return knn_model.predict(test_matrix)
# Demo: train on data/train, predict data/test and report the accuracy.
if __name__ == '__main__':
    path = 'data'  # dataset root containing the train/ and test/ splits
    train_texts, train_labels = load_data(os.path.join(path, 'train'))
    test_texts, test_labels = load_data(os.path.join(path, 'test'))
    predict_labels = text_classification(train_texts, train_labels, test_texts)
    print(predict_labels)  # predicted label of every test document
    # The ground-truth labels were loaded above; use them so the demo shows
    # how well the classifier did instead of only dumping raw predictions.
    if test_labels:
        correct = sum(
            predicted == expected
            for predicted, expected in zip(predict_labels, test_labels)
        )
        print('accuracy: {:.4f}'.format(correct / len(test_labels)))
```
需要注意的是,该代码假定数据集按文件夹组织:`data/train` 和 `data/test` 下的每个子文件夹代表一个类别(文件夹名即类别标签),子文件夹中包含若干个 UTF-8 编码的文本文件,例如 `data/train/体育/001.txt`。如果您的数据集格式或文件编码不同,需要根据实际情况修改 `load_data` 函数。
阅读全文