"""Build a privacy-word lexicon from seed words and Weibo texts using BERT.

Reads seed words and Weibo posts (one per line), encodes each post with the
``bert-base-chinese`` model, computes a cosine-similarity score for every
non-seed token, and writes the resulting lexicon to
``output/privacy_words.txt``.
"""
import torch
from transformers import BertTokenizer, BertModel


def _read_nonblank_lines(path):
    """Return the stripped, non-empty lines of the UTF-8 text file at *path*."""
    with open(path, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        # Blank lines would otherwise put an empty "word" into the lexicon.
        return [line for line in stripped_lines if line]


# Load the seed lexicon (one word per line).
seed_words = _read_nonblank_lines("output/base_words.txt")
print(seed_words)

# Load the Weibo text data (one post per line).
text_data = _read_nonblank_lines("output/weibo1.txt")
print(text_data)

# Load the BERT model and its tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')

# BERT cannot encode more positions than it has position embeddings; reserve
# two slots for the [CLS] and [SEP] markers added below.
max_wordpieces = model.config.max_position_embeddings - 2

# Build the privacy lexicon, starting from the seed words.
privacy_words = set(seed_words)
# Immutable copy for O(1) membership tests that are unaffected by later
# additions to privacy_words.
seed_lookup = frozenset(seed_words)

for text in text_data:
    # Tokenize the text and add the special markers.
    wordpieces = tokenizer.tokenize(text)[:max_wordpieces]
    tokens = ["[CLS]"] + wordpieces + ["[SEP]"]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    segment_ids = [0] * len(token_ids)

    # Convert to tensors and encode with BERT.
    token_tensor = torch.tensor([token_ids])
    segment_tensor = torch.tensor([segment_ids])
    with torch.no_grad():
        # Pass the segment ids by keyword: the second positional parameter of
        # BertModel.forward is attention_mask, so passing the all-zero
        # segment tensor positionally would mask out every token.
        outputs = model(token_tensor, token_type_ids=segment_tensor)
    encoded_layers = outputs[0]

    # Vectors of the real tokens (without [CLS] / [SEP]).
    context_vectors = encoded_layers[0][1:-1]

    # For every non-seed token, compute its similarity score.
    for i, word in enumerate(tokens[1:-1], start=1):
        if word in seed_lookup:
            continue
        word_tensor = encoded_layers[0][i].reshape(1, -1)
        # The original post called an un-imported cosine_similarity() here;
        # torch's implementation works directly on tensors and broadcasts
        # the (1, hidden) vector against the (n, hidden) matrix.
        sim = torch.nn.functional.cosine_similarity(
            word_tensor, context_vectors, dim=1
        ).max().item()
        # NOTE(review): context_vectors contains the token's own vector, so
        # `sim` is always ~1.0. The section comment says the score should be
        # against the seed words, which would require seed-word embeddings.
        # The threshold below stays disabled until that is resolved.
        # if sim > 0.5:
        #     privacy_words.add(word)

# Write out the privacy lexicon, one word per line.
with open("output/privacy_words.txt", "w", encoding="utf-8") as f:
    f.writelines(word + "\n" for word in privacy_words)

# Question from the original post (translated): in the code above, which
# library should the cosine_similarity() used in
# `sim = cosine_similarity(word_tensor, encoded_layers[0][1:-1])[0].max()`
# come from, and is that call correct?
时间: 2024-02-23 07:57:25 浏览: 114
头条中文新闻训练集、验证集、测试集toutiao_cat_data.(train/dev/test).txt
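这段代码中使用的 `cosine_similarity()` 函数应该来自 `sklearn.metrics.pairwise`,用于计算两组向量之间的余弦相似度;其调用方式(传入两个二维数组、返回相似度矩阵,再取 `[0].max()`)与该函数的接口一致。但代码中并没有导入它,需要在文件开头加上 `from sklearn.metrics.pairwise import cosine_similarity`,否则运行时会抛出 `NameError`;传入的 PyTorch 张量建议先用 `.numpy()` 转为 NumPy 数组(也可以改用 `torch.nn.functional.cosine_similarity` 直接在张量上计算)。另外需要注意,`encoded_layers[0][1:-1]` 是当前句子中所有词的向量,其中包含该词自身,因此 `max()` 的结果恒约为 1;若要计算与种子词的相似度,应当与种子词的向量进行比较。此外,代码中使用了 `torch` 和 `transformers` 库来加载和使用预训练的BERT模型和分词器,以及进行张量操作。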
阅读全文