"""Score every row of a POS-tagged segmentation sheet against a target row.

Pipeline, run at module level:

1. Read the workbook and embed each ``word_list`` entry with an ELMo model.
2. For every ``(start, end)`` span in ``pos_index``, average the embedding rows
   in that span into one word vector and scale it by the weight configured for
   the matching tag in ``pos_list``.
3. Average the word vectors of each row and compute the cosine similarity
   between that mean and the mean of the target row.
4. Store the result in a ``similarity`` column and write a new workbook.
"""
import ast
import math

import numpy as np
import pandas as pd
from elmoformanylangs import Embedder
from sklearn.metrics.pairwise import cosine_similarity  # noqa: F401  (unused here; kept from the original import list)

MODEL_DIR = '../../model/zhs.model'
INPUT_XLSX = r'../Result/医生分词结果词性表.xlsx'
OUTPUT_XLSX = '../Result/余弦相似度结果Plus.xlsx'
SHEET_NAME = 'Sheet1'

# Width of the empty seed array every row matrix is stacked onto.
# Presumably the output size of the ELMo model -- TODO confirm.
ELMO_DIM = 1024

# Row whose mean vector every other row is compared with.
# NOTE(review): the original comment said "the third sentence", but the code
# has always used index 0 -- confirm which one is intended.
TARGET_ROW = 0

# Multiplier applied to a word vector, keyed by POS tag; tags that are not
# listed keep their vector unchanged.
weight_dict = {'v': 1.2, 'a': 0.8}


def _parse_list_cell(cell):
    """Parse a spreadsheet cell that holds a Python list literal.

    A float NaN (how pandas represents an empty cell) yields an empty list.
    ``ast.literal_eval`` is used instead of ``eval`` so that the contents of
    the workbook can never execute code.

    Raises:
        ValueError, SyntaxError: if the cell is not a valid Python literal.
    """
    if isinstance(cell, float) and math.isnan(cell):
        return []
    return ast.literal_eval(cell)


def _weighted_word_vectors(char_vecs, pos_list, pos_index):
    """Return one ``{'vector': ndarray}`` dict per span in ``pos_index``.

    Each vector is the sum of ``char_vecs[start:end]`` divided by
    ``end - start``, multiplied by ``weight_dict[tag]`` when the span's tag
    (``pos_list[j]``) has a configured weight.

    Raises:
        ValueError: if a span is empty or inverted (``end <= start``).
        IndexError: if ``pos_list`` has fewer entries than ``pos_index``.
    """
    words = []
    for j, (start, end) in enumerate(pos_index):
        if end <= start:
            # The original failed here with a bare ZeroDivisionError (or a
            # shape error for inverted spans); fail with the offending span.
            raise ValueError(f'invalid span ({start}, {end}) in pos_index')
        word_vec = np.sum(char_vecs[start:end], axis=0) / (end - start)
        # A missing tag falls back to 1.0, which leaves the vector unchanged.
        word_vec = word_vec * weight_dict.get(pos_list[j], 1.0)
        words.append({'vector': word_vec})
    return words


def _stack_vectors(words):
    """Stack the word vectors of one row into a ``(n_words, ELMO_DIM)`` array.

    The empty float64 seed keeps the result two-dimensional (and float64) even
    when the row has no words.
    """
    seed = np.zeros((0, ELMO_DIM))
    return np.vstack([seed] + [word['vector'] for word in words])


def _similarity_to_target(vec, target_mean, target_norm):
    """Cosine similarity between the mean row of ``vec`` and the target mean.

    Returns ``''`` for a matrix without rows, so that rows with no words get
    an empty cell instead of NaN.
    """
    if vec.shape[0] == 0:
        return ''
    sent_mean = vec.mean(axis=0)
    return np.dot(sent_mean, target_mean) / (np.linalg.norm(sent_mean) * target_norm)


e = Embedder(MODEL_DIR)

# Texts to embed; missing cells become empty strings.
df = pd.read_excel(INPUT_XLSX, sheet_name=SHEET_NAME)
sents = df['word_list'].fillna('').tolist()
# Per the original comments the model returns per-character vectors here.
# output_layer=-1 is documented by ELMoForManyLangs as the average of the
# three layers -- TODO confirm against the installed version.
elmo_vecs = e.sents2elmo(sents, output_layer=-1)

# Turn character vectors into POS-weighted word vectors, row by row. The tag
# and span columns come from the same sheet, so it is not read a second time.
word_vecs_plus = [
    _weighted_word_vectors(char_vecs, _parse_list_cell(tags), _parse_list_cell(spans))
    for char_vecs, tags, spans in zip(elmo_vecs, df['pos_list'], df['pos_index'])
]
print(word_vecs_plus)  # debug dump retained from the original script

# One (n_words, ELMO_DIM) matrix per row.
word_vecs_plus_fix = [_stack_vectors(words) for words in word_vecs_plus]

# Cosine similarity of every row against the target row; the target's mean and
# norm do not change between rows, so they are computed once.
target_vec = word_vecs_plus_fix[TARGET_ROW]
target_mean = target_vec.mean(axis=0)
target_norm = np.linalg.norm(target_mean)
sims = [_similarity_to_target(vec, target_mean, target_norm) for vec in word_vecs_plus_fix]

# Attach the similarities to the original frame and save it as a workbook.
df['similarity'] = sims
df.to_excel(OUTPUT_XLSX, sheet_name=SHEET_NAME, index=False)