import ast
import numpy as np
import pandas as pd
from elmoformanylangs import Embedder
from sklearn.metrics.pairwise import cosine_similarity
e = Embedder('../../model/zhs.model')
# Texts for similarity comparison
df = pd.read_excel(r'../Result/医生分词结果词性表.xlsx', sheet_name='Sheet1')
sents = df['word_list'].fillna('').tolist()
elmo_vecs = e.sents2elmo(sents, output_layer=-1)
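# sents2elmo returns one array per input sentence; with output_layer=-1 the embedder
# averages its three layers, giving 1024-dimensional vectors. Each sentence is passed
# as a single string here, so it is embedded character by character (hence the
# character-to-word averaging below).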
# print(elmo_vecs)
# POS tags and word index spans
df1 = pd.read_excel(r'../Result/医生分词结果词性表.xlsx', sheet_name='Sheet1')
# POS weight dictionary: up-weight verbs ('v'), down-weight adjectives ('a')
weight_dict = {'v': 1.2, 'a': 0.8}
# Convert character vectors to word vectors and apply POS weights
word_vecs_plus = []
for i, char_vecs in enumerate(elmo_vecs):
    # Word spans and POS tags for sentence i
    pos_list = ast.literal_eval(df1.loc[i, 'pos_list'])
    pos_index = ast.literal_eval(df1.loc[i, 'pos_index'])
    words = []
    for j, (start, end) in enumerate(pos_index):
        # Average the character vectors within the word span
        word_vec = sum(char_vecs[start:end]) / (end - start)
        pos = pos_list[j]
        if pos in weight_dict:
            word_vec = word_vec * weight_dict[pos]
        words.append({'vector': word_vec})
    word_vecs_plus.append(words)
print(word_vecs_plus)
# Format conversion: stack each sentence's word vectors into an (n_words, 1024) array
word_vecs_plus_fix = []
for n in word_vecs_plus:
    my_array = np.zeros((0, 1024))
    for m in n:
        my_array = np.vstack([my_array, m['vector']])
    word_vecs_plus_fix.append(my_array)
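# word_vecs_plus_fix now holds one array per sentence with shape (n_words, 1024);
# an empty sentence leaves a (0, 1024) array.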
# Compute the cosine similarity between each sentence and the target (first) sentence,
# collecting all similarities in a list
target_vec = word_vecs_plus_fix[0]
target_mean = target_vec.mean(axis=0)
sims = []
for vec in word_vecs_plus_fix:
    if vec.shape[0] == 0:  # empty sentence: no word vectors, so no similarity
        sims.append('')
    else:
        vec_mean = vec.mean(axis=0)
        sims.append(np.dot(vec_mean, target_mean) /
                    (np.linalg.norm(vec_mean) * np.linalg.norm(target_mean)))
# Append the cosine similarities to the original DataFrame and save as an Excel file
df['similarity'] = sims
df.to_excel('../Result/余弦相似度结果Plus.xlsx', sheet_name='Sheet1', index=False)
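# Alternative (a minimal sketch, not part of the pipeline above): the same similarities
# can be computed in one call with sklearn's cosine_similarity, which is already imported.
# The names sent_means and sims_check are introduced here only for illustration; zero rows
# from empty sentences come out as 0.0 instead of the '' placeholder used above.
sent_means = np.vstack([v.mean(axis=0) if v.shape[0] > 0 else np.zeros(1024)
                        for v in word_vecs_plus_fix])
sims_check = cosine_similarity(sent_means, target_vec.mean(axis=0).reshape(1, -1))[:, 0]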