import jieba

def get_txt():
    txt = open(r"E:\TXT\threekingdoms.txt", "r", encoding="utf-8").read()
    return txt

three_txt = get_txt()
three_txt = jieba.lcut(three_txt)
# print(three_txt)
counts = {}
for word in three_txt:
    if len(word) <= 1:
        continue
    if word == "诸葛亮" or word == "孔明曰":
        rword = "孔明"
    elif word == "关公" or word == "云长":
        rword = "关羽"
    elif word == "玄德" or word == "玄德曰":
        rword = "刘备"
    elif word == "孟德" or word == "丞相":
        rword = "曹操"
    elif word == "周瑜" or word == "都督":
        rword = "周瑜"
    else:
        rword = word
    counts[rword] = counts.get(rword, 0) + 1  # count word frequency, creating key-value pairs in the dictionary
# print(counts)
items = list(counts.items())  # convert the unordered dictionary into a sortable list
items.sort(key=lambda x: x[1], reverse=True)  # sort by the second element of each item, descending
# print(items)
for i in range(10):
    word, count = items[i]
    print("{:<5}:{:>5}".format(word, count))  # print the sorted top-10 result with aligned formatting
This code first imports the `jieba` module for word segmentation. It then defines a function `get_txt()` that opens a UTF-8 encoded text file at the given path, reads its contents into the variable `txt`, and returns it. The function is called and the result stored in `three_txt`, which is then segmented with `jieba.lcut()`, with the resulting word list stored back into `three_txt`. The loop that follows skips single-character tokens, merges common aliases for the same person (e.g. 诸葛亮/孔明曰 → 孔明, 关公/云长 → 关羽) into one canonical name, and counts each name's frequency in the dictionary `counts`. Finally the dictionary is converted to a list, sorted by count in descending order, and the ten most frequent names are printed with aligned formatting.
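For comparison, the counting-and-sorting steps can also be expressed with `collections.Counter`. The sketch below is an equivalent reformulation of the same logic (reusing the file path from the question), not the code as originally posted:

```python
from collections import Counter

import jieba

# alias table merging the different names used for the same character
alias = {"诸葛亮": "孔明", "孔明曰": "孔明", "关公": "关羽", "云长": "关羽",
         "玄德": "刘备", "玄德曰": "刘备", "孟德": "曹操", "丞相": "曹操", "都督": "周瑜"}

words = jieba.lcut(open(r"E:\TXT\threekingdoms.txt", "r", encoding="utf-8").read())
# drop single-character tokens, map aliases to canonical names, and count
counts = Counter(alias.get(w, w) for w in words if len(w) > 1)
for word, count in counts.most_common(10):  # top 10, already sorted by frequency
    print("{:<5}:{:>5}".format(word, count))
```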
Related questions
def chinese_word_cut(mytext):
    jieba.load_userdict(dic_file)
    jieba.initialize()
    try:
        stopword_list = open(stop_file, encoding='utf-8')
    except:
        stopword_list = []
        print("error in stop_file")
    stop_list = []
    flag_list = ['n', 'nz', 'vn']
    for line in stopword_list:
        line = re.sub(u'\n|\\r', '', line)
        stop_list.append(line)
    word_list = []
    # jieba word segmentation
    seg_list = psg.cut(mytext)
    for seg_word in seg_list:
        # word = re.sub(u'[^\u4e00-\u9fa5]', '', seg_word.word)
        word = seg_word.word
        find = 0
        for stop_word in stop_list:
            if stop_word == word or len(word) < 2:  # this word is a stopword
                find = 1
                break
        if find == 0 and seg_word.flag in flag_list:
            if word in synonym_origin:
                index = synonym_origin.index(word)
                word = synonym_new[index]
            word_list.append(word)
    return (" ").join(word_list)

data["content"] = data.content.astype(str)
data["content_cutted"] = data.content.apply(chinese_word_cut)

Add regular expressions to this code for data cleaning.
This code is a function for Chinese word segmentation and data cleaning. It first uses the jieba library to load a user dictionary and initialize the tokenizer. It then tries to open the stopword file; if that fails, the stopword list is set to empty and an error message is printed. Next it builds a stopword list and a part-of-speech whitelist (`['n', 'nz', 'vn']`). During segmentation, each token is checked: if it equals a stopword or is shorter than two characters it is skipped; otherwise, if its part-of-speech tag is in the whitelist, the token (after an optional synonym substitution via `synonym_origin`/`synonym_new`) is appended to the result list. Finally the words in the result list are joined with spaces and returned as the segmentation result.
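For reference, `psg` in the question presumably refers to `jieba.posseg` imported under that alias (an assumption, since the import statement is not shown); it yields items exposing `.word` and `.flag`, which is what the part-of-speech filter `flag_list = ['n', 'nz', 'vn']` checks:

```python
import jieba.posseg as psg  # assumed to be the `psg` alias used in the question

for pair in psg.cut("周瑜大都督统领水军"):
    # each item carries the token text and its part-of-speech tag
    print(pair.word, pair.flag)
```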
When the function is applied, the `content` column of the input data is first converted to string type, and the segmentation result is stored in a new column `content_cutted`. A regular expression is already used to strip newline characters from the stopword entries, while the line that would strip non-Chinese characters from each token is currently commented out; the exact cleaning behaviour also depends on the contents of `stop_file` and the `synonym_origin`/`synonym_new` lists.
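One possible way to satisfy the request to "add regular expressions for data cleaning" is to re-enable that commented-out substitution so every token is stripped of non-Chinese characters before the stopword and part-of-speech checks. The sketch below is only an illustration; `clean_token` is a hypothetical helper name, not part of the original code:

```python
import re

def clean_token(raw_word):
    # keep only characters in the CJK Unified Ideographs range; drop everything else
    return re.sub(u'[^\u4e00-\u9fa5]', '', raw_word)

# inside the segmentation loop, `word = seg_word.word` would become:
# word = clean_token(seg_word.word)
print(clean_token("Python3数据-清洗2023"))  # -> 数据清洗
```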
Socrates was a famous ancient Greek thinker, philosopher, educator, and citizen juror. Some of his sayings have been translated into Chinese, part of which is given in sgld.txt. Using the code template below, fill in the blanks to perform Chinese word segmentation and count how many times "人" appears.

import jieba
with open("sgld.txt", "r", encoding="utf-8") as f:
    lssgld = ___
fo = open("sgldout.txt", "w", encoding="utf-8")
cishu = 0
for ls in ___:
    ls = ls.strip()
    wordlist = list(___)
    for w in ___:
        if ("人" in w):
            cishu += 1
    fo.writelines("\n".___)
fo.close()
print(cishu)
import jieba
with open("sgld.txt", "r", encoding="utf-8") as f:
    lssgld = f.readlines()
fo = open("sgldout.txt", "w", encoding="utf-8")
cishu = 0
for ls in lssgld:
    ls = ls.strip()
    wordlist = list(jieba.cut(ls))
    for w in wordlist:
        if ("人" in w):
            cishu += 1
    fo.writelines("\n".join(wordlist))
fo.close()
print(cishu)
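As a quick sanity check (a hypothetical snippet, not part of the exercise), the same counting logic can be tried on a short in-memory string before pointing it at sgld.txt:

```python
import jieba

sample = "未经审视的人生不值得过。"  # hypothetical sample line standing in for sgld.txt
cishu = 0
for w in jieba.lcut(sample):
    if "人" in w:
        cishu += 1
print(cishu)  # number of segmented words that contain "人"
```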