Help me use the jieba library in Python to implement document duplicate checking, keyword extraction, main-idea extraction, and similar features, and print the results
I can help you use the jieba library in Python to implement document duplicate checking, keyword extraction, and main-idea extraction. Here is a code example:
# Import the jieba library
import jieba
import jieba.analyse

# Read the document
with open('document.txt', 'r', encoding='utf-8') as f:
    doc = f.read()

# Document duplicate checking: Jaccard similarity over word sets
def document_similarity(doc1, doc2):
    # Segment each text into a list of words
    words1 = jieba.lcut(doc1)
    words2 = jieba.lcut(doc2)
    # Turn the word lists into sets
    set1 = set(words1)
    set2 = set(words2)
    # Compute the intersection and union
    intersection = set1 & set2
    union = set1 | set2
    # Similarity = |intersection| / |union| (guard against two empty texts)
    similarity = len(intersection) / len(union) if union else 1.0
    return similarity

# Call the duplicate-checking function; comparing a document with
# itself always yields 1.00, so this is only a smoke test
similarity = document_similarity(doc, doc)
print('Document similarity: %.2f' % similarity)
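In practice you would compare two different files. A minimal usage sketch, assuming a hypothetical second file document2.txt next to document.txt:

# Usage sketch: compare two different documents
# (document2.txt is a hypothetical path; point it at your own file)
with open('document2.txt', 'r', encoding='utf-8') as f:
    doc2 = f.read()
print('Similarity between the two files: %.2f' % document_similarity(doc, doc2))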
# Keyword extraction with TF-IDF, restricted to nouns and verbs via allowPOS
keywords = jieba.analyse.extract_tags(doc, topK=10, withWeight=True, allowPOS=('n', 'vn', 'v'))
print('Keywords:')
for keyword, weight in keywords:
    print(keyword, weight)
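If the extracted keywords still contain noise words, jieba.analyse lets you swap in your own stop-word list before calling extract_tags. A short sketch, assuming a hypothetical stopwords.txt file with one word per line:

# Optional: load a custom stop-word list so those words are never
# returned as keywords (stopwords.txt is a hypothetical file)
jieba.analyse.set_stop_words('stopwords.txt')
keywords = jieba.analyse.extract_tags(doc, topK=10, withWeight=True)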
# Main-idea extraction: print sentences that contain TextRank keywords
sentences = doc.split('。')  # Split the text into sentences at the full stop
keywords = jieba.analyse.textrank(doc, topK=10, withWeight=False)
print('Main idea:')
printed = set()  # A sentence with several keywords should print only once
for keyword in keywords:
    for sentence in sentences:
        if keyword in sentence and sentence not in printed:
            printed.add(sentence)
            print(sentence.strip())  # Print each keyword-bearing sentence once
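Printing every keyword-bearing sentence can still be long-winded. A common refinement is to score each sentence by how many distinct keywords it contains and keep only the top few; the summarize helper and top_n parameter below are illustrative names, not part of jieba's API:

# Sketch of a simple extractive summary: rank sentences by keyword hits
# (summarize and top_n are illustrative names, not jieba API)
def summarize(text, top_n=3):
    sents = [s.strip() for s in text.split('。') if s.strip()]
    kws = set(jieba.analyse.textrank(text, topK=10, withWeight=False))
    # Sort sentences by the number of distinct keywords they contain
    scored = sorted(sents, key=lambda s: sum(kw in s for kw in kws), reverse=True)
    return scored[:top_n]

for sent in summarize(doc):
    print(sent)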