提取其中代码,给出可运行代码
时间: 2024-12-06 16:25:17 浏览: 12
以下是文档中提到的可运行代码片段,经过整理和简化:
### 导入库
```python
import json
import re
import string
from collections import Counter
from functools import reduce
from queue import PriorityQueue as PQueue

import nltk
import numpy as np
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
```
### 读取文件
```python
def read_corpus(path="data/train-v2.0.json"):
    """Load aligned question and answer lists from a JSON corpus.

    The file is expected to have the layout
    ``{"data": [{"paragraphs": [{"qas": [{"question": ..., "answers":
    [{"text": ...}, ...]}]}]}]}``. For every question that has at least one
    answer, the question text and the text of its first answer are kept.
    Questions whose ``answers`` list is empty are skipped, as are entries
    that lack a ``paragraphs`` or ``qas`` key.

    Args:
        path: Location of the JSON file. Defaults to the path that was
            previously hard-coded.

    Returns:
        A ``(qlist, alist)`` tuple of equally long lists, where
        ``alist[i]`` is the answer to ``qlist[i]``.
    """
    qlist = []
    alist = []
    # Explicit UTF-8: the platform default encoding may fail on non-ASCII text.
    with open(path, "r", encoding="utf-8") as corpus_file:
        corpus = json.load(corpus_file)
    for article in corpus["data"]:
        for paragraph in article.get("paragraphs", []):
            for qa in paragraph.get("qas", []):
                answers = qa["answers"]
                if answers:
                    qlist.append(qa["question"])
                    alist.append(answers[0]["text"])
    assert len(qlist) == len(alist)  # the two lists are index-aligned
    return qlist, alist
```
### 数据分析
```python
def data_analysis(data):
    """Count the words in the questions and plot the most frequent ones.

    Each sentence is stripped of surrounding whitespace and trailing
    question marks, then split on whitespace. The counts of the (up to)
    100 most frequent words are plotted in descending order.

    Args:
        data: Iterable of question strings.

    Returns:
        A ``(total_words, distinct_words)`` tuple: the number of word
        occurrences and the number of different words.
    """
    word_counts = Counter()
    for sentence in data:
        # Remove only trailing "?" rather than blindly cutting the last
        # character, and split on any whitespace so that repeated spaces do
        # not produce empty "words".
        word_counts.update(sentence.strip().rstrip("?").split())
    total_words = sum(word_counts.values())
    distinct_words = len(word_counts)

    top_counts = [count for _, count in word_counts.most_common(100)]
    plt.figure()
    # The x axis follows len(top_counts), so a corpus with fewer than 100
    # distinct words still plots instead of failing on mismatched lengths.
    plt.plot(range(len(top_counts)), top_counts)
    plt.show()
    return total_words, distinct_words
# Load the question/answer lists and plot the word-frequency curve of the questions.
qlist, alist = read_corpus()
data_analysis(qlist)
```
### 文本预处理
```python
def data_pre(temp_list):
    """Normalise sentences into stemmed tokens and count the tokens.

    For every sentence: punctuation is removed, the text is lowercased and
    split on whitespace, English stop words are dropped, all-digit tokens
    are replaced by ``"#number"`` and the result is Porter-stemmed.

    Args:
        temp_list: Iterable of sentence strings.

    Returns:
        A ``(word_dict, word_list_list)`` tuple: ``word_dict`` maps each
        kept token to its number of occurrences, ``word_list_list`` holds
        one list of kept tokens per input sentence.
    """
    english_stops = set(stopwords.words('english'))
    porter = PorterStemmer()
    punct_re = re.compile('[{}]'.format(re.escape(string.punctuation)))
    frequencies = {}
    tokenized = []
    for raw in temp_list:
        kept = []
        for token in punct_re.sub("", raw).lower().split():
            if token in english_stops:
                continue
            if token.isdigit():
                token = "#number"  # collapse every number into one token
            stem = porter.stem(token)
            if stem in frequencies:
                frequencies[stem] += 1
            else:
                frequencies[stem] = 1
            kept.append(stem)
        tokenized.append(kept)
    return frequencies, tokenized
def filter_words(in_list=None, in_dict=None, lower=0, upper=0):
    """Keep only the words whose frequency lies within ``[lower, upper]``.

    Args:
        in_list: Iterable of token lists, one per sentence. ``None`` is
            treated as empty.
        in_dict: Mapping from word to frequency. ``None`` is treated as
            empty.
        lower: Minimum frequency to keep (inclusive).
        upper: Maximum frequency to keep (inclusive).

    Returns:
        A list with one string per input sentence: the surviving tokens in
        their original order, joined by single spaces (an empty string when
        no token survives).
    """
    if in_list is None:
        in_list = []
    if in_dict is None:
        in_dict = {}
    # A set gives O(1) membership tests inside the per-token loop below.
    kept_words = {word for word, freq in in_dict.items() if lower <= freq <= upper}
    return [' '.join(w for w in line if w in kept_words) for line in in_list]
```
### 文本表示
```python
# Preprocess the corpus before fitting, so that the TF-IDF vocabulary is built
# from the same normalised tokens that top5results() produces for a query.
qlist, alist = read_corpus()
q_dict, q_list_list = data_pre(qlist)
new_qlist = filter_words(q_list_list, q_dict, 2, 1000)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(new_qlist)


def top5results(input_q):
    """Return the answers to the stored questions most similar to ``input_q``.

    The query is normalised the same way as the corpus (punctuation removed,
    lowercased, stop words dropped, digits mapped to ``"#number"``,
    Porter-stemmed), turned into a TF-IDF vector with the module-level
    ``vectorizer`` and compared with every row of ``X`` by cosine
    similarity.

    Args:
        input_q: The user's question as a string.

    Returns:
        A list of up to five entries of ``alist``, best match first.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    tokens = []
    for word in pattern.sub("", input_q).lower().split():
        if word in stop_words:
            continue
        word = "#number" if word.isdigit() else word
        tokens.append(stemmer.stem(word))
    input_vec = vectorizer.transform([' '.join(tokens)])
    scores = cosine_similarity(input_vec, X)[0]
    # Stable argsort of the negated scores ranks the highest similarity first
    # and breaks ties by lower index. Slicing, unlike Queue.get(), cannot
    # block when fewer than five questions exist.
    top_idxs = (-scores).argsort(kind="stable")[:5]
    return [alist[i] for i in top_idxs]


print(top5results("when did Beyonce start becoming popular?"))
print(top5results("what language does the word of 'symbiosis' come from"))
```
### 使用倒排表优化
```python
# Inverted index: token -> ascending list of indices of the questions that
# contain it. It is built from new_qlist (the preprocessed questions) because
# top5results_invidx() looks up lowercased, stemmed query tokens and scores the
# candidates against new_qlist; indexing the raw qlist would leave most of
# those lookups without a match.
inverted_idx = {}
for i, question in enumerate(new_qlist):
    # set(): list each question at most once per token. Indices are appended
    # in increasing order, so the posting lists need no extra sort.
    for word in set(question.split()):
        inverted_idx.setdefault(word, []).append(i)
def intersections(set1, set2):
    """Return the elements of ``set1`` that also occur in ``set2``."""
    shared = set1.intersection(set2)
    return shared
def top5results_invidx(input_q):
    """Return the answers to the best-matching candidate questions.

    The query is normalised (punctuation removed, lowercased, stop words
    dropped, digits mapped to ``"#number"``, Porter-stemmed). The
    module-level ``inverted_idx`` narrows the search to the questions that
    contain every query token present in the index. Those candidates are
    then ranked by TF-IDF cosine similarity, with the vectorizer fitted on
    ``new_qlist``.

    Args:
        input_q: The user's question as a string.

    Returns:
        A list of up to five entries of ``alist``, best match first. The
        list is empty when no query token is indexed or no question
        contains all of the indexed tokens.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    tokens = []
    for word in pattern.sub("", input_q).lower().split():
        if word in stop_words:
            continue
        word = "#number" if word.isdigit() else word
        tokens.append(stemmer.stem(word))

    posting_sets = [set(inverted_idx[t]) for t in tokens if t in inverted_idx]
    # reduce() without an initial value raises TypeError on an empty
    # sequence, so a query with no indexed token is answered explicitly.
    if not posting_sets:
        return []
    candidate_idx = list(reduce(intersections, posting_sets))
    if not candidate_idx:
        return []  # nothing to rank; skip fitting the vectorizer

    vectorizer = TfidfVectorizer(smooth_idf=False)
    X = vectorizer.fit_transform(new_qlist)
    input_vec = vectorizer.transform([' '.join(tokens)])
    # One call scores all candidates: row k belongs to candidate_idx[k].
    similarities = cosine_similarity(input_vec, X[candidate_idx])[0]
    ranked = sorted(zip(candidate_idx, similarities),
                    key=lambda pair: pair[1], reverse=True)
    return [alist[idx] for idx, _ in ranked[:5]]
# Demo: run two sample queries through the inverted-index retrieval.
print(top5results_invidx("when did Beyonce start becoming popular?"))
print(top5results_invidx("what language does the word of 'symbiosis' come from"))
```
### 词向量表示
```python
def load_glove(path):
    """Load whitespace-separated word vectors from a text file.

    Every non-blank line holds a token followed by its float components.

    Args:
        path: Location of the UTF-8 encoded vector file.

    Returns:
        A ``(vocab, embedding)`` tuple. ``vocab`` maps each token to its row
        index in ``embedding``; index 0 is reserved for ``"UNK"``, whose row
        is an all-zero vector. ``embedding`` is a list of float lists.
    """
    vocab = {"UNK": 0}
    embedding = [[0] * 100]  # placeholder row; resized below once the size is known
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            row = line.split()
            if not row:
                continue  # tolerate blank lines instead of failing on row[0]
            # len(embedding) is the index the new row is about to receive.
            vocab[row[0]] = len(embedding)
            embedding.append([float(value) for value in row[1:]])
    if len(embedding) > 1:
        # Give the UNK row the same length as the loaded vectors.
        embedding[0] = [0.0] * len(embedding[1])
    return vocab, embedding
def word_to_vec(sentence, vocab, embedding):
    """Return the mean embedding of the in-vocabulary words of ``sentence``.

    Args:
        sentence: Whitespace-separated text.
        vocab: Mapping from word to row index in ``embedding``.
        embedding: Sequence of equally long numeric vectors.

    Returns:
        A 1-D NumPy array: the element-wise mean of the vectors of all words
        found in ``vocab``. When no word is found, a zero vector is returned
        with the length of ``embedding[0]`` (100 if ``embedding`` is empty).
    """
    vecs = [embedding[vocab[word]] for word in sentence.split() if word in vocab]
    if vecs:
        return np.mean(vecs, axis=0)
    # Match the table's dimensionality rather than assuming 100.
    dim = len(embedding[0]) if len(embedding) else 100
    return np.zeros(dim)
# Load the word vectors once at module level; top5results_emb() reads both names.
vocab, embedding = load_glove("data/glove.6B.100d.txt")
def top5results_emb(input_q):
    """Return the answers to the candidates closest to ``input_q`` in embedding space.

    The query is normalised (punctuation removed, lowercased, stop words
    dropped, digits mapped to ``"#number"``). An inverted index over
    ``new_qlist`` narrows the search to the questions that contain every
    indexed query token. The candidates are ranked by cosine similarity
    between the averaged word vectors of the query and of the candidate.

    Args:
        input_q: The user's question as a string.

    Returns:
        A list of up to five entries of ``alist``, best match first. The
        list is empty when no query token is indexed or no question
        contains all of the indexed tokens.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    result_list = []
    for word in pattern.sub("", input_q).lower().split():
        if word not in stop_words:
            result_list.append("#number" if word.isdigit() else word)
    input_q = " ".join(result_list)

    # Local inverted index over new_qlist: token -> ascending question indices.
    # NOTE(review): this is rebuilt on every call; consider hoisting it to
    # module level if new_qlist never changes.
    inverted_idx = {}
    for i, question in enumerate(new_qlist):
        for word in set(question.split()):
            inverted_idx.setdefault(word, []).append(i)

    # new_qlist comes from data_pre(), which stems its tokens, so the index
    # keys are stems. Look the query words up by stem, while the unstemmed
    # words are kept for the embedding lookup below.
    candidates = []
    for word in result_list:
        stem = stemmer.stem(word)
        if stem in inverted_idx:
            candidates.append(set(inverted_idx[stem]))
    # reduce() without an initial value raises TypeError on an empty
    # sequence, so a query with no indexed token is answered explicitly.
    if not candidates:
        return []
    candidate_idx = list(reduce(intersections, candidates))

    input_q_vec = word_to_vec(input_q, vocab, embedding)
    scores = []
    for i in candidate_idx:
        vec = word_to_vec(new_qlist[i], vocab, embedding)
        # [0][1] is the similarity between the query (row 0) and the
        # candidate (row 1) in the pairwise matrix.
        scores.append((i, cosine_similarity([input_q_vec, vec])[0][1]))
    scores_sorted = sorted(scores, key=lambda pair: pair[1], reverse=True)
    return [alist[idx] for idx, _ in scores_sorted[:5]]
# Demo: run three sample queries through the word-embedding retrieval.
print(top5results_emb("when did Beyonce start becoming popular?"))
print(top5results_emb("what language does the word of 'symbiosis' come from"))
print(top5results_emb("In her music, what are some?"))
```
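以上代码涵盖了从读取文件、数据分析、文本预处理、文本表示,到基于倒排表和词向量的相似度计算与答案返回的全过程。运行前需要准备好 data/train-v2.0.json 和 data/glove.6B.100d.txt 两个数据文件,并通过 nltk.download('stopwords') 下载 NLTK 停用词表。你可以根据实际需求调整参数和路径。希望这些代码对你有帮助!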
阅读全文