I modified the code as suggested:

```
import torch
from transformers import BertTokenizer, BertModel

# Load the pretrained BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-chinese')
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Weibo text and seed words
text = '今天天气真好,心情非常愉快!'
seeds = ['天气', '心情', '愉快']

# Convert the Weibo text and seed words into BERT input format
inputs = tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')
seed_inputs = tokenizer.encode_plus(seeds, add_special_tokens=True, return_tensors='pt', padding=True)

# Get embeddings for the text and the seed words
with torch.no_grad():
    text_embeddings = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])[0]  # [1, seq_len, hidden_size]
    seed_embeddings = model(seed_inputs['input_ids'], attention_mask=seed_inputs['attention_mask'])[0]  # [batch_size, seq_len, hidden_size]

# Compute cosine similarity between the seed words and every token in the text
text_embeddings = text_embeddings.squeeze(0)  # [seq_len, hidden_size]
seed_embeddings = seed_embeddings.mean(dim=1)  # [batch_size, seq_len, hidden_size] -> [batch_size, hidden_size]
cosine_similarities = torch.matmul(text_embeddings, seed_embeddings.transpose(0, 1))  # [seq_len, batch_size]

# Get the most similar token for each seed word
similar_words = []
for i in range(len(seeds)):
    seed_similarities = cosine_similarities[i, :].tolist()
    max_sim_idx = seed_similarities.index(max(seed_similarities))
    similar_word = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][max_sim_idx].item())
    similar_words.append(similar_word)
print(similar_words)
```

The modified code above outputs ['[CLS]', '[CLS]', '[CLS]'], which is not what I want. I want all the words in the Weibo text that are similar to the seed words, not the special tokens that BERT adds automatically. How do I fix this?
Set `add_special_tokens=False` when encoding the text and the seed words with the tokenizer, so `[CLS]` and `[SEP]` are never added. Beyond that, three other bugs need fixing for the result to be meaningful: `encode_plus` is meant for a single text, so the list of seeds should be batch-encoded by calling the tokenizer directly; the similarity matrix has shape [seq_len, num_seeds], so seed i's similarities are in column i, not row i; and `torch.matmul` on raw embeddings gives dot products, so the vectors must be L2-normalized first to get true cosine similarities. The corrected code:
```
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
# Load the pretrained BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-chinese')
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# Weibo text and seed words
text = '今天天气真好,心情非常愉快!'
seeds = ['天气', '心情', '愉快']
# Convert the Weibo text and the seed words into BERT input format.
# Calling the tokenizer directly on a list batch-encodes each seed word
# separately; encode_plus expects a single text, not a batch
inputs = tokenizer(text, add_special_tokens=False, return_tensors='pt')
seed_inputs = tokenizer(seeds, add_special_tokens=False, return_tensors='pt', padding=True)
# Run BERT to get contextual embeddings for the text and the seed words
with torch.no_grad():
    text_embeddings = model(**inputs).last_hidden_state        # [1, seq_len, hidden_size]
    seed_embeddings = model(**seed_inputs).last_hidden_state   # [num_seeds, seed_len, hidden_size]

# Pool each seed word into one vector, averaging only over non-padding positions
seed_mask = seed_inputs['attention_mask'].unsqueeze(-1).float()                     # [num_seeds, seed_len, 1]
seed_embeddings = (seed_embeddings * seed_mask).sum(dim=1) / seed_mask.sum(dim=1)   # [num_seeds, hidden_size]

# Cosine similarity is the dot product of L2-normalized vectors;
# matmul on unnormalized embeddings would only give raw dot products
text_embeddings = F.normalize(text_embeddings.squeeze(0), dim=-1)  # [seq_len, hidden_size]
seed_embeddings = F.normalize(seed_embeddings, dim=-1)             # [num_seeds, hidden_size]
cosine_similarities = torch.matmul(text_embeddings, seed_embeddings.transpose(0, 1))  # [seq_len, num_seeds]
# For each seed word, find the most similar token in the text
similar_words = []
for i in range(len(seeds)):
    seed_similarities = cosine_similarities[:, i]  # column i holds every text token's similarity to seed i
    max_sim_idx = seed_similarities.argmax().item()
    similar_word = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][max_sim_idx].item())
    similar_words.append(similar_word)
print(similar_words)
```
This prints the single most similar text token for each seed word. Note that `bert-base-chinese` tokenizes at the character level, so the returned "words" are individual characters. If you want every token above a similarity cutoff rather than just the top match per seed, filter the similarity matrix directly, as in the sketch below.
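A minimal sketch of that threshold filtering, reusing `cosine_similarities`, `inputs`, and `seeds` from the corrected code above; the 0.5 cutoff is an arbitrary placeholder to tune on your own data:

```
# Collect every text token whose cosine similarity to a seed exceeds a threshold
threshold = 0.5  # hypothetical cutoff; tune on real data
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
for i, seed in enumerate(seeds):
    matches = [(tokens[j], cosine_similarities[j, i].item())
               for j in range(len(tokens))
               if cosine_similarities[j, i] > threshold]
    print(seed, matches)
```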