中文文本实现多文本相似度计算的代码
时间: 2024-03-03 13:53:49 浏览: 115
中文文本相似度匹配算法
以下是一个简单的中文文本相似度计算的代码实现,使用了jieba分词和余弦相似度计算方法:
```python
import jieba
import numpy as np
def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two vectors.

    Args:
        vector1: First vector (any array-like accepted by ``np.dot`` and
            ``np.linalg.norm``).
        vector2: Second vector, with the same length as ``vector1``.

    Returns:
        ``dot(vector1, vector2) / (norm(vector1) * norm(vector2))``, or
        ``0.0`` when either vector has zero norm.
    """
    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if denominator == 0:
        # A zero vector (for example, one built from an empty text) has no
        # direction; dividing here would yield NaN and a RuntimeWarning, so
        # report "no similarity" instead.
        return 0.0
    return np.dot(vector1, vector2) / denominator
def get_word_vector(text):
    """Count how often each jieba token occurs in ``text``.

    Args:
        text: String to segment with ``jieba.lcut``.

    Returns:
        A dict mapping each token to its number of occurrences.
    """
    frequencies = {}
    for token in jieba.lcut(text):
        # Missing tokens start from 0, so first and repeat sightings share
        # the same update.
        frequencies[token] = frequencies.get(token, 0) + 1
    return frequencies
def get_text_vector(text, word_vector):
    """Build the term-count vector of ``text`` over a given vocabulary.

    Args:
        text: String to segment with ``jieba.lcut``.
        word_vector: Iterable of vocabulary words; its iteration order
            determines the position of each component in the result.

    Returns:
        A NumPy array with one entry per vocabulary word, holding the number
        of times that word occurs among the tokens of ``text`` (0 if absent).
    """
    # Tally every token once up front so that each vocabulary lookup is a
    # dict access instead of a rescan of the whole token list.
    counts = {}
    for token in jieba.lcut(text):
        counts[token] = counts.get(token, 0) + 1
    return np.array([counts.get(word, 0) for word in word_vector])
def text_similarity(text1, text2):
    """Return the cosine similarity of two texts' term-count vectors.

    Each text is tokenized and counted by ``get_word_vector``; both count
    tables are then projected onto the union of their tokens and compared
    with ``cosine_similarity``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        Whatever ``cosine_similarity`` returns for the two count vectors.
    """
    counts1 = get_word_vector(text1)
    counts2 = get_word_vector(text2)
    # Shared vocabulary, materialized as a list so that both vectors are
    # laid out in exactly the same order.
    vocabulary = list(counts1.keys() | counts2.keys())
    # Reuse the counts computed above instead of segmenting each text a
    # second time.
    vector1 = np.array([counts1.get(word, 0) for word in vocabulary])
    vector2 = np.array([counts2.get(word, 0) for word in vocabulary])
    return cosine_similarity(vector1, vector2)
```
使用方法:
```python
# Two sample sentences that differ only in their final character.
text1, text2 = "这是一段测试文本1", "这是一段测试文本2"
# Compare the two samples and print the resulting score.
similarity = text_similarity(text1, text2)
print(similarity)
```
输出结果为两个文本的余弦相似度:由于词频向量的分量均为非负数,取值范围为 0 到 1,越接近 1 表示两段文本越相似。
阅读全文