python实现tf-idf中文文本分类
时间: 2023-08-01 15:07:20 浏览: 269
下面是使用 Python 实现 TF-IDF 中文文本分类的示例代码：
```python
import jieba
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# 读取文本数据
def read_text(file_path):
    """Load a labelled corpus from a tab-separated text file.

    Each non-blank line must look like ``<label>\\t<text>``.  Only the first
    tab separates the label from the text, so the text itself may contain
    further tabs.

    Args:
        file_path: Path of the UTF-8 encoded data file.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists of strings.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    texts = []
    labels = []
    # 'utf-8-sig' reads plain UTF-8 as well, but also drops a leading BOM so
    # it does not end up glued to the first label.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no sample.
                continue
            label, sep, text = line.partition('\t')
            if not sep:
                raise ValueError(
                    f"{file_path}:{line_no}: expected '<label>\\t<text>', "
                    f"got {line!r}"
                )
            texts.append(text)
            labels.append(label)
    return texts, labels
# 分词
def cut_words(texts):
    """Segment every text with jieba and join its tokens with single spaces.

    Args:
        texts: Iterable of raw text strings.

    Returns:
        A list with one space-joined token string per input text, in the
        same order as ``texts``.
    """
    return [' '.join(jieba.cut(raw_text)) for raw_text in texts]
# 构建词典
def build_vocab(cut_texts, k):
    """Build a word-to-index vocabulary from the ``k`` most frequent tokens.

    Args:
        cut_texts: Iterable of whitespace-separated token strings.
        k: Maximum number of vocabulary entries to keep; passed straight to
            ``Counter.most_common``.

    Returns:
        A dict mapping each kept token to its frequency rank (0 = most
        frequent).
    """
    counter = Counter()
    for cut_text in cut_texts:
        # split() without an argument collapses runs of whitespace, so two
        # adjacent spaces never produce an empty-string "word" that would
        # otherwise be counted and could occupy a vocabulary slot.
        counter.update(cut_text.split())
    return {word: index for index, (word, _) in enumerate(counter.most_common(k))}
# 计算TF-IDF值
def compute_tfidf(cut_texts, vocab):
    """Turn pre-segmented texts into a dense TF-IDF matrix.

    Args:
        cut_texts: Iterable of whitespace-separated token strings.
        vocab: Mapping of token -> column index, used as the fixed
            vocabulary of the ``CountVectorizer``.

    Returns:
        The TF-IDF weights as a dense array (``tfidf.toarray()``).
    """
    # CountVectorizer's default token_pattern only keeps tokens of two or
    # more word characters, which silently discards single-character words.
    # Those are common in Chinese, so accept tokens of length one as well.
    # NOTE(review): CountVectorizer lowercases documents by default, so
    # vocabulary keys containing uppercase letters will never match --
    # confirm whether the vocabulary should be lowercased upstream.
    vectorizer = CountVectorizer(vocabulary=vocab, token_pattern=r"(?u)\b\w+\b")
    transformer = TfidfTransformer()
    counts = vectorizer.fit_transform(cut_texts)
    tfidf = transformer.fit_transform(counts)
    return tfidf.toarray()
# 训练分类器
def train(x_train, y_train):
    """Fit a multinomial naive Bayes classifier and return it.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels, one per row of ``x_train``.

    Returns:
        The fitted ``MultinomialNB`` instance.
    """
    # fit() returns the estimator itself, so the call can be chained.
    return MultinomialNB().fit(x_train, y_train)
# 测试分类器
def test(clf, x_test, y_test):
    """Print the classifier's accuracy on the held-out samples.

    Args:
        clf: Fitted classifier exposing ``predict``.
        x_test: Feature matrix to predict on.
        y_test: True labels for ``x_test``.
    """
    predictions = clf.predict(x_test)
    print("accuracy:", accuracy_score(y_test, predictions))
# 主函数
if __name__ == '__main__':
    # Load the labelled corpus and segment every document into tokens.
    raw_texts, raw_labels = read_text('data.txt')
    segmented = cut_words(raw_texts)

    # Keep the 10000 most frequent tokens and weight the documents by TF-IDF.
    # NOTE(review): the features are computed on the whole corpus before the
    # split below, so the test documents also contribute to the IDF weights.
    features = compute_tfidf(segmented, build_vocab(segmented, 10000))

    # Hold out 20% of the samples for evaluation (fixed seed for repeatability).
    train_x, test_x, train_y, test_y = train_test_split(
        features, raw_labels, test_size=0.2, random_state=42
    )

    # Train on the training part, then report accuracy on the held-out part.
    model = train(train_x, train_y)
    test(model, test_x, test_y)
```
其中，`read_text` 函数用于读取文本数据，每行数据包括标签和文本（以制表符分隔），可以根据实际情况进行修改。`cut_words` 函数用于对文本进行分词，使用 jieba 分词工具。`build_vocab` 函数用于构建词典，选取出现频率最高的 k 个词语作为特征。`compute_tfidf` 函数用于计算 TF-IDF 值，使用 sklearn 库中的 CountVectorizer 和 TfidfTransformer 类。`train` 函数用于训练分类器，使用朴素贝叶斯算法（MultinomialNB）。`test` 函数用于测试分类器，计算分类准确率。主函数中的代码完成了整个流程，可以根据实际情况进行修改。
阅读全文