import jieba def get_txt(): txt = open(r"E:\TXT\threekingdoms.txt", "r", encoding="utf-8").read() return txt three_txt = get_txt() three_txt = jieba.lcut(three_txt) # print(three_txt) counts = {} for word in three_txt: if len(word) <= 1: continue if word == "诸葛亮" or word == "孔明曰": rword = "孔明" elif word == "关公" or word == "云长": rword = "关羽" elif word == "玄德" or word == "玄德曰": rword = "刘备" elif word == "孟德" or word == "丞相": rword = "曹操" elif word == "周瑜" or word == "都督": rword = "周瑜" else: rword = word counts[rword] = counts.get(rword, 0) + 1 # 统计词频并在字典中创建键值对 # print(counts) items = list(counts.items()) # 将无序的字典类型转换为可排序的列表类型 items.sort(key=lambda x: x[1], reverse=True) # 以元素的第二列进行从大到小排序 # print(items) for i in range(10): word, count = items[i] print("{:<5}:{:>5}".format(word, count)) # 格式化输出排序结果

from transformers import pipeline, BertTokenizer, BertModel import numpy as np import torch import jieba tokenizer = BertTokenizer.from_pretrained('bert-base-chinese') model = BertModel.from_pretrained('bert-base-chinese') ner_pipeline = pipeline('ner', model='bert-base-chinese') with open('output/weibo1.txt', 'r', encoding='utf-8') as f: data = f.readlines() def cosine_similarity(v1, v2): return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)) def get_word_embedding(word): input_ids = tokenizer.encode(word, add_special_tokens=True) inputs = torch.tensor([input_ids]) outputs = model(inputs)[0][0][1:-1] word_embedding = np.mean(outputs.detach().numpy(), axis=0) return word_embedding def get_privacy_word(seed_word, data): privacy_word_list = [] seed_words = jieba.lcut(seed_word) jieba.load_userdict('data/userdict.txt') for line in data: words = jieba.lcut(line.strip()) ner_results = ner_pipeline(''.join(words)) for seed_word in seed_words: seed_word_embedding = get_word_embedding(seed_word) for ner_result in ner_results: if ner_result['word'] == seed_word and ner_result['entity'] == 'O': continue if ner_result['entity'] != seed_word: continue word = ner_result['word'] if len(word) < 3: continue word_embedding = get_word_embedding(word) similarity = cosine_similarity(seed_word_embedding, word_embedding) print(similarity, word) if similarity >= 0.6: privacy_word_list.append(word) privacy_word_set = set(privacy_word_list) return privacy_word_set 上述代码运行之后，结果为空集合，哪里出问题了，帮我修改一下

with open('output/weibo1.txt', 'r', encoding='utf-8') as f: data = f.readlines() def cosine_similarity(v1, v2): return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)) def get_word_...

import logging import jieba import gensim from gensim.models import Word2Vec def get_Segment(): texts = [] jieba.load_userdict("data\\name_dict.txt") with open('data\\in_the_name_of_people.txt','r',encoding='utf-8') as f: for line in f.readlines(): texts.append(list(jieba.cut(line.strip()))) with open('data\\in_the_name_of_people_segment.txt','w',encoding='utf-8')as f: for line in texts: f.write(" ".join(w for w in line)) f.write("\r\n") def getmodel(): logging.basicConfig(format='%(asctime)s : %(LeveLname)s : %(message)s',level=logging.INFO) sentences = word2vec.LineSentence('data\\in_the_name_of_people_segment.txt') model = word2vec.Word2Vec(sentences,min_count=1) return model if name=='main': get_Segment() model = getmodel() print('相似度: ',model.wv.similarity('人民','名义')) print(model.wv.similarity('候亮平','钟小艾')) print(model.mv.most_similar('候亮平',topn=10))

with open('data\\in_the_name_of_people.txt','r',encoding='utf-8') as f: for line in f.readlines(): texts.append(list(jieba.cut(line.strip()))) with open('data\\in_the_name_of_people_segment.txt','...

import sys import re import jieba import codecs import gensim import numpy as np import pandas as pd def segment(doc: str): stop_words = pd.read_csv('data/stopwords.txt', index_col=False, quoting=3, names=['stopword'], sep='\n', encoding='utf-8') stop_words = list(stop_words.stopword) reg_html = re.compile(r'<[^>]+>', re.S) # 去掉html标签数字等 doc = reg_html.sub('', doc) doc = re.sub('[０-９]', '', doc) doc = re.sub('\s', '', doc) word_list = list(jieba.cut(doc)) out_str = '' for word in word_list: if word not in stop_words: out_str += word out_str += ' ' segments = out_str.split(sep=' ') return segments def doc2vec(file_name, model): start_alpha = 0.01 infer_epoch = 1000 doc = segment(codecs.open(file_name, 'r', 'utf-8').read()) doc_vec_all = model.infer_vector(doc, alpha=start_alpha, steps=infer_epoch) return doc_vec_all # 计算两个向量余弦值 def similarity(a_vect, b_vect): dot_val = 0.0 a_norm = 0.0 b_norm = 0.0 cos = None for a, b in zip(a_vect, b_vect): dot_val += a * b a_norm += a 2 b_norm += b 2 if a_norm == 0.0 or b_norm == 0.0: cos = -1 else: cos = dot_val / ((a_norm * b_norm) ** 0.5) return cos def test_model(file1, file2): print('导入模型') model_path = 'tmp/zhwk_news.doc2vec' model = gensim.models.Doc2Vec.load(model_path) vect1 = doc2vec(file1, model) # 转成句子向量 vect2 = doc2vec(file2, model) print(sys.getsizeof(vect1)) # 查看变量占用空间大小 print(sys.getsizeof(vect2)) cos = similarity(vect1, vect2) print('相似度：%0.2f%%' % (cos * 100)) if name == 'main': file1 = 'data/corpus_test/t1.txt' file2 = 'data/corpus_test/t2.txt' test_model(file1, file2)

其中，首先使用 jieba 库对文本进行分词，并去除停用词，然后使用 gensim.models.Doc2Vec 中的 infer_vector 方法将文本转化为向量表示，最后使用余弦相似度计算两个向量之间的相似度。该代码中使用了两个测试文件 ...

以下代码出现报错：def chinese_word_cut(mytext): jieba.load_userdict(dic_file) jieba.initialize() try: stopword_list = open(stop_file,encoding ='utf-8') except: stopword_list = [] print("error in stop_file") stop_list = [] flag_list = ['n','nz','vn'] for line in stopword_list: line = re.sub(u'\n|\\r', '', line) stop_list.append(line) word_list = [] #jieba分词 seg_list = psg.cut(mytext) for seg_word in seg_list: word = re.sub(u'[^\u4e00-\u9fa5]','',seg_word.word) #word = seg_word.word #如果想要分析英语文本，注释这行代码，启动下行代码 find = 0 for stop_word in stop_list: if stop_word == word or len(word)<2: #this word is stopword find = 1 break if find == 0 and seg_word.flag in flag_list: word_list.append(word) return (" ").join(word_list)datacontent=data.content data["content_cutted"] = chinese_word_cut(datacontent)

2. 代码依赖的jieba和re模块需要被引入，需要在代码的开头添加import jieba和import re语句。 3. 变量data未被定义，需要保证在这段代码被执行前，data变量已经被正确定义或赋值。 4. 函数chinese_...

import jieba def word_extract(): # 读取文件 corpus = [] path = 'D:/自然语言处理/第2章/data/金庸-白马啸西风.txt' content = '' for line in open(path, 'r', encoding='gbk', errors='ignore'): line = line.strip() content += line corpus.append(content) # 加载停用词 stop_words = [] path = 'D:/自然语言处理/第4章/data/stopword.txt' for line in open(path, encoding='utf8'): line = line.strip() stop_words.append(line) # jieba分词 split_words = [] word_list = jieba.cut(corpus[0]) for word in word_list: if word not in stop_words: split_words.append(word)这段代码哪里有问题

for line in open(path, encoding='utf8'): line = line.strip() stop_words.append(line) # jieba分词 split_words = [] word_list = jieba.cut(corpus[0]) for word in word_list: if word not in ...

def get_content(html_url): response = get_response(html_url=html_url) selector = parsel.Selector(response.text) title = selector.css('.grap--h2 ::text').get() content_list = selector.css('.grap ::text').getall() content = ''.join(content_list) return title, content def main(): url = f'https://hongloumeng.5000yan.com/' name, link_list = get_novel_info(novel_url=url) for link in link_list: title, content = get_content(html_url=link) save(name, title, content) word=[] wordlist = [] #这里是定义函数加进列表里，想让entry的内容作为变量参与函数的运行 def add_to_list(): text = entry2.get() words.append(text) wordlist = [] for word in sWords: if word in words: wordlist.append(word) def function(): f=open('D:\Python文档保存\红楼梦.txt','r',encoding='utf-8') text = f.read() plt.rcParams['font.sans-serif'] = 'SimHei' sWords = jieba.lcut(text) #wordlist = [] #words = ['刘姥姥', '贾', '王夫人'] #for word in sWords: # if word in words: # wordlist.append(word) word_counts = collections.Counter(wordlist) print(wordlist) print(word_counts) #按钮3 button3=tk.Button(root,text='词频统计图',bg="#FFB6C1",command=func()) button3.place(x=320,y=165) 把前面的代码绑定在这个按钮上，改一下代码

main() 函数利用 get_novel_info() 函数获取小说信息（小说名和链接列表），然后遍历链接列表，调用 get_content() 函数获取每个链接对应的内容，并将小说名、章节名、正文内容一起保存。 word 和 wordlist 两个...

def chinese_word_cut(mytext): jieba.load_userdict(dic_file) jieba.initialize() try: stopword_list = open(stop_file,encoding ='utf-8') except: stopword_list = []

它使用了jieba库来进行分词操作。首先，它加载了一个自定义的词典文件，以便jieba能够识别更多的词汇。然后，它尝试打开一个停用词文件，并将其读取为一个停用词列表。如果无法打开停用词文件，则停用词列表将为空...

def Stop_words(): stopword = [] data = [] f = open('C:/Users/Administrator/Desktop/data/stopword.txt',encoding='utf8') for line in f.readlines(): data.append(line) for i in data: output = str(i).replace('\n','')#replace用法和sub函数很接近 stopword.append(output) return stopword # 采用jieba进行词性标注，对当前文档过滤词性和停用词 def Filter_word(text): filter_word = [] stopword = Stop_words() text = jieba.posseg.cut(text) for word, flag in text: if flag.startswith('n') is False:#用于检测字符串是否以指定的子字符串开始 continue if not word in stopword and len(word) > 1: filter_word.append(word) return filter_word # 对文档集过滤词性和停用词 def Filter_words(data_path =r'C:/Users/Administrator/Desktop/data//corpus.txt'): document = [] for line in open(data_path, 'r',encoding= 'utf8') : segment = jieba.posseg.cut(line.strip()) filter_words = [] stopword = Stop_words() for word, flag in segment: if flag.startswith('n') is False: continue if not word in stopword and len(word) > 1: filter_words.append(word) document.append(filter_words) return document def tf_idf(): tf_dict = {} idf_dict = {} filter_word = Filter_word(text) for word in filter_word: if word not in tf_dict: tf_dict[word] = 1 else: tf_dict[word] += 1 for word in tf_dict: tf_dict[word] = tf_dict[word] / len(text) document = Filter_words() doc_total = len(document) for doc in document: for word in set(doc): if word not in idf_dict: idf_dict[word] = 1 else: idf_dict[word] += 1 for word in idf_dict: idf_dict[word] = math.log(doc_total / (idf_dict[word] + 1)) tf_idf_dict = {} for word in filter_word: if word not in idf_dict: idf_dict[word] = 0 tf_idf_dict[word] = tf_dict[word] * idf_dict[word] return tf_idf_dict tf_idf_dict = tf_idf() keyword = 6 print('TF-IDF模型结果:') for key, value in sorted(tf_idf_dict.items(), key=operator.itemgetter(1),reverse=True)[:keyword]: print(key, end=' ') print('\n')

这段代码实现了一个 TF-IDF 模型，用于计算文本中关键词的权重。其中，Stop_words 函数用于读取停用词表，Filter_word 函数用于对单个文档进行过滤，Filter_words 函数用于对整个文档集进行过滤。...

import jieba import re from tokenizer import cut_hanlp jieba.load_userdict("dict.txt") def merge_two_list(a, b): c=[] len_a, len_b = len(a), len(b) minlen = min(len_a, len_b) for i in range(minlen): c.append(a[i]) c.append(b[i]) if len_a > len_b: for i in range(minlen, len_a): c.append(a[i]) else: for i in range(minlen, len_b): c.append(b[i]) return c if name=="main": fp=open("text.txt","r",encoding="utf8") fout=open("result_cut.txt","w",encoding="utf8") # 保存结果 regex1=u'(?:[^\u4e00-\u9fa5（）*&……%￥$，,。.@! ！]){1,5}期' #打开非汉子的正则模式， xxx期 regex2=r'(?:[0-9]{1,3}[.]?[0-9]{1,3})%' #打开非汉子的正则模式， xxx.xxx% p1=re.compile(regex1) p2=re.compile(regex2) for line in fp.readlines(): result1=p1.findall(line) #是否有正则表达式， if result1: regex_re1=result1 line=p1.sub("FLAG1",line) #如果有用XXX期，FLAG1代替 result2=p2.findall(line) if result2: line=p2.sub("FLAG2",line) #如果有用xxx%，用FLAG2代替 words=jieba.cut(line) words1=cut_hanlp(line) result=" ".join(words) if "FLAG1" in result: result=result.split("FLAG1") # 从FLAG1处断开 result=merge_two_list(result,result1) result="".join(result) if "FLAG2" in result: result=result.split("FLAG2") result=merge_two_list(result,result2) result="".join(result) #print(result) fout.write(result) fout.close()

- 导入 jieba 和 re 模块，以及 tokenizer 模块中的 cut_hanlp 函数。 - 使用 jieba.load_userdict 函数加载自定义词典 dict.txt。 - 定义函数 merge_two_list，用于将两个列表按照顺序合并。 - 在 ...

修改 import jieba import jieba.posseg as pseg path = r'D:/明朝collection.txt' with open(path, 'r', encoding='utf-8') as f: text = f.read() jieba.enable_paddle() words = pseg.cut(text, use_paddle=True) counts = {} for word in words: if len(word) == 1: continue else: counts[word] = counts.get(word, 0) + 1 items = list(counts.items()) items.sort(key=lambda x: x[1], reverse=True) for i in range(500): word, count = items[i] print("{0:<5}{1:>5}".format(word, count))

with open(path, 'r', encoding='utf-8') as f: text = f.read() jieba.enable_paddle() words = pseg.cut(text, use_paddle=True) counts = {} for word, flag in words: if len(word) == 1: continue else: ...

import requests from bs4 import BeautifulSoup import jieba.analyse import jieba.posseg as pseg from snownlp import SnowNLP import matplotlib.pyplot as plt # 设置请求头，模拟浏览器访问 headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'} # 获取网页内容 def get_html(url): resp = requests.get(url, headers=headers) resp.encoding = resp.apparent_encoding html = resp.text return html # 获取新闻列表 def get_news_list(url): html = get_html(url) soup = BeautifulSoup(html, 'html.parser') news_list = soup.find_all('a', class_="news_title") return news_list # 对文本进行情感分析 def sentiment_analysis(text): s = SnowNLP(text) return s.sentiments # 对文本进行关键词提取 def keyword_extraction(text): keywords = jieba.analyse.extract_tags(text, topK=10, withWeight=True, allowPOS=('n', 'vn', 'v')) return keywords # 对新闻进行分析 def analyze_news(url): news_list = get_news_list(url) senti_scores = [] # 情感分数列表 keyword_dict = {} # 关键词词频字典 for news in news_list: title = news.get_text().strip() link = news['href'] content = get_html(link) soup = BeautifulSoup(content, 'html.parser') text = soup.find('div', class_='article').get_text().strip() # 计算情感分数 senti_score = sentiment_analysis(text) senti_scores.append(senti_score) # 提取关键词 keywords = keyword_extraction(text) for keyword in keywords: if keyword[0] in keyword_dict: keyword_dict[keyword[0]] += keyword[1] else: keyword_dict[keyword[0]] = keyword[1] # 绘制情感分数直方图 plt.hist(senti_scores, bins=10, color='skyblue') plt.xlabel('Sentiment Score') plt.ylabel('Number of News') plt.title('Sentiment Analysis') plt.show() # 输出关键词词频排名 keyword_list = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True) print('Top 10 keywords:') for i in range(10): print('{}. {} - {:.2f}'.format(i+1, keyword_list[i][0], keyword_list[i][1])) if name == 'main': url = 'https://www.sina.com.cn/' analyze_news(url)

在主函数中，它调用了get_news_list()函数来获取新闻列表，然后对每篇新闻进行情感分析和关键词提取，并将情感分数和关键词词频存储到列表和字典中。最后，它绘制情感分数直方图，输出关键词词频排名。

import csv import jieba.posseg as pseg import jieba import paddle path = r'D:/明朝collection.txt' with open(path, 'r', encoding='utf-8') as f: text = f.read() jieba.enable_paddle() words = pseg.cut(text, use_paddle=True) counts = {} for word, flag in words: if len(word) == 1: continue else: counts[word] = counts.get(word, 0) + 1 items = list(counts.items()) items.sort(key=lambda x: x[1], reverse=True) # 将结果写入CSV文件 with open('D:/output.csv', 'w', encoding='utf-8-sig', newline='') as f: writer = csv.writer(f) writer.writerow(['Word', 'Count', 'Flag']) # 写入表头 for i in range(500): word, count = items[i] print("{0:<5}{1:>5}".format(word, count)) words_with_flag = pseg.cut(word, use_paddle=True) for w, f in words_with_flag: print(f) writer.writerow([word, count,flag])

with open(path, 'r', encoding='utf-8') as f: text = f.read() jieba.enable_paddle() words = pseg.cut(text, use_paddle=True) counts = {} for word, flag in words: if len(word) == 1: continue else: ...

用python完成实践（中文词频统计）：对中文词频统计项目（import jieba import wordcloud class ChineseWordCounter: def init(self): self.content = '' self.words = [] self.word_frequencies = {} def readfile_demo_with(self, filename): with open(filename, encoding='utf-8') as fp: self.content = fp.read() def seperate_words(self): ignore_word = ('的', '与', '个', '和') for word in jieba.cut(self.content, cut_all=False): if (len(word) > 1) and (word not in ignore_word): self.words.append(word) def count_words(self): for word in self.words: if word in self.word_frequencies.keys(): self.word_frequencies[word] += 1 else: self.word_frequencies[word] = 1 def draw_wordcloud(self, img_file): font = 'c:/windows/font/方正粗黑宋简体.ttf' cloudengine = wordcloud.WordCloud(font_path=font) cloudengine.generate(" ".join(self.words)) cloudengine.to_file(img_file)）进行修改，把统计出的词频保存到指定的csv文件中。

with open(filename, encoding='utf-8') as fp: self.content = fp.read() def separate_words(self): ignore_word = ('的', '与', '个', '和') for word in jieba.cut(self.content, cut_all=False): if ...

一个基于Qt Creator（qt,C++）实现中国象棋人机对战

qt 一个基于Qt Creator（qt,C++）实现中国象棋人机对战.

热带雨林自驾游自然奇观探索.doc

热带雨林自驾游自然奇观探索

相关推荐

jieba-0_jieba安装0.4.1_jieba-0.42.1.tar_jieba-0.42.1.tar.gz_jieba-

Python库 | jieba_fast-0.51.tar.gz

jieba-lucene-analiysis:jieba的lucene中文分析器和令牌生成器

def chinese_word_cut(mytext): jieba.load_userdict(dic_file) jieba.initialize() try: stopword_list = open(stop_file,encoding ='utf-8') except: stopword_list = []

一个基于Qt Creator（qt,C++）实现中国象棋人机对战

热带雨林自驾游自然奇观探索.doc

大家在看

华为CloudIVS 3000技术主打胶片v1.0（C20190226）.pdf

dosbox:适用于Android的DosBox Turbo FreeBox

功率谱密度：时间历程的功率谱密度。-matlab开发

南京工业大学Python程序设计语言题库及答案

Windows6.1--KB2533623-x64.zip

最新推荐

一个基于Qt Creator（qt,C++）实现中国象棋人机对战

热带雨林自驾游自然奇观探索.doc

Windows下操作Linux图形界面的VNC工具

【SketchUp Ruby API：从入门到精通】

VMware虚拟机打开虚拟网络编辑器出现由于找不到vnetlib.dll,无法继续执行代码。重新安装程序可能会解决问题

基于Preact的高性能PWA实现定期天气信息更新

从停机到上线，EMC VNX5100控制器SP更换的实战演练

ubuntu labelme中文版安装

全新免费HTML5商业网站模板发布

EMC VNX5100控制器SP更换全流程指南：新手到高手的必备技能