请写一段代码用LDA对文本进行主题提取,并用主题提取的结果使用逻辑回归进行文本分类
时间: 2023-12-14 18:36:38 浏览: 81
以下是一个Python示例,使用gensim库实现LDA主题建模和sklearn库实现逻辑回归文本分类:
```python
# 导入所需库
import pandas as pd
import numpy as np
import re
import nltk
from gensim import corpora, models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Load the text data from a UTF-8 CSV file.
# The rest of this script reads the 'text' and 'category' columns of this frame.
data = pd.read_csv('text.csv', encoding='utf-8')
# Data cleaning and preprocessing
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, loaded only once."""
    if 'english' not in _STOPWORDS_CACHE:
        # Reading the corpus is comparatively expensive, and list membership is
        # O(n); cache a set so each token check is a single hash lookup.
        _STOPWORDS_CACHE['english'] = frozenset(
            nltk.corpus.stopwords.words('english')
        )
    return _STOPWORDS_CACHE['english']


def clean_text(text):
    """Normalize a raw document and return its list of content tokens.

    Steps: strip punctuation, strip digits, lowercase, tokenize with NLTK,
    then drop tokens shorter than three characters and English stop words.

    Args:
        text: The raw document. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as an empty document.

    Returns:
        A list of cleaned, lowercased tokens (possibly empty).
    """
    if not isinstance(text, str):
        # Missing cells would otherwise crash inside re.sub.
        return []
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)  # remove digits
    tokens = nltk.word_tokenize(text.lower())
    stop_words = _english_stopwords()
    return [
        word for word in tokens
        if len(word) > 2 and word not in stop_words
    ]
# Tokenize and normalize every document
data['text_clean'] = data['text'].apply(clean_text)
# Build the bag-of-words representation
dictionary = corpora.Dictionary(data['text_clean'])
corpus = [dictionary.doc2bow(tokens) for tokens in data['text_clean']]
# Train the LDA model. The seed is fixed so that the learned topics, and the
# classifier features derived from them, are reproducible between runs.
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    random_state=42,
)
# Extract the topic distribution of a document
def get_topic_distribution(lda_model, text):
    """Return the dense topic-probability vector of one tokenized document.

    Args:
        lda_model: A trained gensim LDA model. It must have been built with
            ``id2word`` set to a gensim ``Dictionary``, because that
            dictionary is used to convert ``text`` to a bag of words.
        text: The document as a list of tokens.

    Returns:
        A 1-D float array of length ``lda_model.num_topics``. A document with
        no in-vocabulary tokens yields an all-zero vector.
    """
    topic_distribution = np.zeros(lda_model.num_topics)
    # Use the model's own vocabulary rather than a module-level global so the
    # token ids always match the ids the model was trained with.
    bow_vector = lda_model.id2word.doc2bow(text)
    if not bow_vector:
        return topic_distribution
    # Plain ``lda_model[bow]`` omits topics below the model's minimum
    # probability threshold; request every topic so the feature vector is the
    # full distribution.
    doc_topics = lda_model.get_document_topics(
        bow_vector, minimum_probability=0.0
    )
    for topic_id, probability in doc_topics:
        topic_distribution[topic_id] = probability
    return topic_distribution
# Infer one topic vector per cleaned document and keep it on the frame
def _topics_for(tokens):
    """Topic distribution of a single tokenized document under lda_model."""
    return get_topic_distribution(lda_model, tokens)


topic_vectors = data['text_clean'].apply(_topics_for)
data['topic_distribution'] = topic_vectors

# Feature matrix (one row of topic probabilities per document) and labels
X = np.array(data['topic_distribution'].tolist())
y = np.array(data['category'])

# Hold out 20% of the documents for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the logistic regression classifier on the training split
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predicted labels for the held-out documents
y_pred = clf.predict(X_test)

# Mean accuracy on the held-out documents
accuracy = clf.score(X_test, y_test)
print('Accuracy:', accuracy)
```
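其中,`text.csv`是包含文本和类别信息的CSV文件(需要包含`text`和`category`两列),`clean_text()`函数用于清洗和预处理文本数据,`get_topic_distribution()`函数用于提取文本的主题分布,`X`和`y`分别是特征和标签,`train_test_split()`函数用于划分训练集和测试集,`LogisticRegression()`用于创建逻辑回归模型对象,`clf.fit()`方法用于训练模型,`clf.predict()`方法用于预测测试集标签,`clf.score()`方法用于计算模型在测试集上的准确率。注意:首次运行前需要先执行`nltk.download('punkt')`(较新版本的NLTK为`punkt_tab`)和`nltk.download('stopwords')`,下载分词器和停用词资源。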
阅读全文