【python 走进NLP】文本相似度计算--余弦相似度
余弦相似度,又称为余弦相似性,是通过计算两个向量的夹角余弦值来评估它们的相似度。余弦相似度将向量根据坐标值,绘制到向量空间中,如最常见的二维空间。两个向量的夹角越小,余弦值越接近1,说明它们越相似。
# -*- coding: utf-8 -*-
import jieba
import numpy as np
def get_word_vector(s1, s2):
    """
    Build term-frequency vectors for two sentences over a shared vocabulary.

    Both sentences are segmented with ``jieba.cut``. The vocabulary is the
    union of all tokens, ordered by first appearance (tokens of ``s1``
    first, then the new tokens of ``s2``), so the vector layout is the same
    on every run. Both vectors are also printed to stdout.

    :param s1: sentence 1
    :param s2: sentence 2
    :return: tuple ``(word_vector1, word_vector2)`` of 1-D float numpy
        arrays; position ``i`` holds how many times vocabulary word ``i``
        occurs in the corresponding sentence. A sentence that yields no
        tokens produces an all-zero (possibly empty) vector.
    """
    # Segment the sentences. Materialise the generators directly instead of
    # joining on ',' and splitting again: that round trip would turn a
    # token that is itself a comma into two empty-string tokens.
    list_word1 = list(jieba.cut(s1))
    list_word2 = list(jieba.cut(s2))

    # Union of all tokens. dict.fromkeys de-duplicates while preserving
    # first-seen order, unlike set(), whose order for strings can change
    # between interpreter runs.
    key_word = list(dict.fromkeys(list_word1 + list_word2))
    position = {word: i for i, word in enumerate(key_word)}

    # Zero-filled vectors, one slot per vocabulary word.
    word_vector1 = np.zeros(len(key_word))
    word_vector2 = np.zeros(len(key_word))

    # Count term frequencies with a single pass over each token list.
    for word in list_word1:
        word_vector1[position[word]] += 1
    for word in list_word2:
        word_vector2[position[word]] += 1

    # Show the vectors.
    print(word_vector1)
    print(word_vector2)
    return word_vector1, word_vector2
def cos_dist(vec1, vec2):
    """
    Return the cosine similarity of two vectors.

    Despite the name, the result is a similarity (the cosine of the angle
    between the vectors), not a distance.

    :param vec1: vector 1, a 1-D array-like of numbers
    :param vec2: vector 2, same length as ``vec1``
    :return: ``dot(vec1, vec2) / (|vec1| * |vec2|)`` as a Python float;
        ``0.0`` when either vector has zero norm (including empty vectors),
        because the angle is undefined there and the plain formula would
        evaluate 0/0 to NaN.
    """
    vec1 = np.asarray(vec1, dtype=float)
    vec2 = np.asarray(vec2, dtype=float)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard the division: a zero vector has no direction to compare.
    if norm_product == 0:
        return 0.0
    return float(np.dot(vec1, vec2) / norm_product)
if __name__ == '__main__':
    # Demo: compare two similar sentences and print their cosine similarity.
    sentence_a = "这只皮靴号码大了。那只号码合适"
    sentence_b = "这只皮靴号码不小,那只更合适"
    vector_a, vector_b = get_word_vector(sentence_a, sentence_b)
    print(cos_dist(vector_a, vector_b))
运行结果:
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\xiaohu\AppData\Local\Temp\jieba.cache
Loading model cost 0.821 seconds.
Prefix dict has been built succesfully.
[1. 1. 2. 2. 1. 1. 1. 1. 0. 0. 1. 0.]
[0. 0. 2. 1. 1. 0. 1. 1. 1. 1. 1. 1.]
0.74535599249993