bert-lda的代码
时间: 2023-11-12 20:09:48 浏览: 211
lda算法代码
以下是使用BERT和LDA模型进行文本主题建模的Python代码示例:
```python
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import torch
from transformers import BertTokenizer, BertModel
# Load the dataset; assumes a CSV with a 'text' column — TODO confirm against the actual file
data = pd.read_csv('data.csv')
# 清理数据
def preprocess(text):
    """Normalize one raw document for topic modeling.

    Steps: strip digits and punctuation, lowercase, tokenize, drop
    tokens of length <= 2, and drop English stopwords.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned tokens re-joined with single spaces.
    """
    text = re.sub(r'\d+', '', text)      # remove digits
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    # Build the stopword set ONCE: the original re-read the stopword
    # corpus (a list) for every token, making each membership test O(n)
    # and the whole pass quadratic. A set gives O(1) lookups.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [w for w in tokens if len(w) > 2 and w not in stop_words]
    return ' '.join(tokens)
# Clean every document in place
data['text'] = data['text'].apply(preprocess)
# Load a pretrained BERT tokenizer and encoder to embed the cleaned text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
def get_vectors(text):
    """Return the BERT [CLS] embedding of *text* as a (1, hidden_size) array.

    Fixes over the original:
    - truncates the encoding to BERT's 512-token positional limit, so
      long documents no longer raise a runtime error inside the model;
    - runs under ``torch.no_grad()`` since this is pure inference,
      avoiding needless autograd bookkeeping and memory use.

    Parameters
    ----------
    text : str
        A (cleaned) document.

    Returns
    -------
    numpy.ndarray
        Shape (1, hidden_size); the last-layer hidden state of [CLS].
    """
    input_ids = torch.tensor(
        tokenizer.encode(text, add_special_tokens=True,
                         max_length=512, truncation=True)
    ).unsqueeze(0)
    with torch.no_grad():  # inference only
        outputs = model(input_ids)
    # [CLS] token (position 0) of the last hidden layer as the doc vector
    return outputs[0][:, 0, :].detach().numpy()
# Stack per-document [CLS] vectors into an (n_docs, hidden_size) matrix.
# NOTE(review): these BERT embeddings are never consumed by the LDA step
# below — LDA runs on bag-of-words counts only; confirm whether the
# vectors were meant to be used (e.g. for clustering) or can be dropped.
vectors = np.concatenate([get_vectors(text) for text in data['text']], axis=0)
# Topic modeling with LDA on a bag-of-words term-count matrix.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000,
                             stop_words='english')
X = vectorizer.fit_transform(data['text'])
lda = LatentDirichletAllocation(n_components=10, max_iter=5,
                                learning_method='online',
                                learning_offset=50., random_state=0)
lda.fit(X)
# Print each topic's top-10 terms.
# FIX: CountVectorizer.get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() is the
# supported replacement and returns an array indexable the same way.
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic #%d:" % topic_idx)
    # argsort is ascending, so the last 10 indices (reversed) are the
    # 10 highest-weight terms for this topic.
    print(" ".join(feature_names[i] for i in topic.argsort()[:-11:-1]))
    print()
```
在此代码示例中,我们首先使用BERT模型获取了每个文本的向量表示;需要注意的是,随后的LDA主题建模是基于词频(词袋)矩阵进行的,并未直接使用BERT向量——如需结合两者,可将BERT向量用于聚类或作为LDA结果的补充特征。该代码示例仅供参考。
阅读全文