"""Count the most frequent words (mainly character names) in a Chinese novel.

The text is segmented with jieba, single-character tokens are dropped,
known aliases of the main characters are merged into one canonical name,
and the ten most frequent words are printed.
"""
from collections import Counter
from typing import Iterable, List, Mapping

# Default location of the novel; pass another path to get_txt() to override.
DEFAULT_TXT_PATH = r"E:\TXT\threekingdoms.txt"

# Alternate names and segmentation artefacts -> canonical character name.
# Words that are not listed here are counted under their own spelling.
ALIASES = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
    "都督": "周瑜",
}


def get_txt(path: str = DEFAULT_TXT_PATH) -> str:
    """Return the full contents of the UTF-8 text file at *path*.

    Raises:
        OSError: if the file cannot be opened or read.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # The context manager closes the handle even if read() raises.
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read()


def count_words(words: Iterable[str]) -> Counter:
    """Count word frequencies, merging aliases and skipping short tokens.

    Tokens of length 0 or 1 (punctuation, particles, whitespace) are
    ignored. Every other token is counted under its canonical name from
    ALIASES, or under itself when it has no alias.
    """
    return Counter(ALIASES.get(word, word) for word in words if len(word) > 1)


def format_top(counts: Mapping[str, int], top_n: int = 10) -> List[str]:
    """Return up to *top_n* formatted ``word:count`` lines, most frequent first.

    Fewer than *top_n* lines are returned when there are fewer distinct
    words. Words with equal counts keep the order in which they were first
    counted, because the sort is stable.
    """
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return [f"{word:<5}:{count:>5}" for word, count in ranked[:top_n]]


def main() -> None:
    """Segment the novel, count the words and print the ten most frequent."""
    # Third-party dependency, imported here so that the pure helpers above
    # stay importable in environments where jieba is not installed.
    import jieba

    words = jieba.lcut(get_txt())
    for line in format_top(count_words(words)):
        print(line)


if __name__ == "__main__":
    main()
时间: 2023-05-23 20:06:03 浏览: 216
搜狗金融词库txt已修改格式版
这段代码首先通过导入`jieba`模块来分词。然后定义了一个函数`get_txt()`,该函数用于打开指定路径下的一个以`utf-8`编码的文本文件,并将文件内容读入到一个变量`txt`中,最后将`txt`变量返回。接着,调用该函数将文本文件中的内容读取并存储到变量`three_txt`中。最后,用`jieba.lcut()`方法对`three_txt`进行分词处理,并将处理结果再次存储到`three_txt`变量中。
阅读全文