import ast
import math

import numpy as np
import pandas as pd
# Not used at runtime any more: the ELMo output is loaded from
# ``elmo_vectors.npy``.  The import is kept for regenerating that file, which
# was produced with ``Embedder.sents2elmo(sents, output_layer=-1)``.
from elmoformanylangs import Embedder

import handler

# Workbook holding one row per sentence; the columns read here are
# 'pos_list', 'pos_index' and 'idx'.
_POS_TABLE_PATH = r'../../data-dev/医生分词结果词性表0511.xlsx'
_POS_TABLE_SHEET = 'Sheet1'

# Pre-computed ELMo output, one entry of character vectors per sentence.
_ELMO_VECTORS_PATH = 'elmo_vectors.npy'

# Weight applied to a word vector according to its part-of-speech tag:
# v = verb, n = noun, a = adjective, d = adverb, c = conjunction,
# u = unknown word.  Tags that are not listed keep a weight of 1.
_POS_WEIGHTS = {'v': 1.2, 'n': 1.2, 'a': 0.8, 'd': 0.8, 'c': 0.6, 'u': 0.6}

# Number of most similar sentences reported for every target.
_TOP_K = 5


def _sentence_vector(char_vecs, pos_list, pos_index):
    """Return the POS-weighted mean word vector of one sentence.

    Each ``(start, end)`` span in ``pos_index`` selects the character vectors
    of one word; they are averaged into a word vector, which is then scaled by
    the weight of the matching tag in ``pos_list``.  The sentence vector is the
    mean of those word vectors (float64).  Returns ``None`` when the sentence
    has no words.
    """
    word_vecs = []
    for j, (start, end) in enumerate(pos_index):
        # Average the character vectors that make up the word.
        word_vec = sum(char_vecs[start:end]) / (end - start)
        word_vecs.append(word_vec * _POS_WEIGHTS.get(pos_list[j], 1.0))
    if not word_vecs:
        return None
    return np.mean(np.asarray(word_vecs, dtype=np.float64), axis=0)


def _load_sentence_vectors(df):
    """Build one sentence vector per entry of the saved ELMo output.

    Entry ``i`` of the ``.npy`` file is paired with the row labelled ``i`` of
    ``df``, whose 'pos_list' and 'pos_index' cells hold Python literals stored
    as text.
    """
    # NOTE(security): allow_pickle=True unpickles arbitrary objects; only load
    # an ``elmo_vectors.npy`` that comes from a trusted source.
    elmo_vecs = np.load(_ELMO_VECTORS_PATH, allow_pickle=True)

    sentence_vecs = []
    for i, char_vecs in enumerate(elmo_vecs):
        # literal_eval parses the stored lists without executing cell content.
        pos_list = ast.literal_eval(df.loc[i, 'pos_list'])
        pos_index = ast.literal_eval(df.loc[i, 'pos_index'])
        sentence_vecs.append(_sentence_vector(char_vecs, pos_list, pos_index))
    return sentence_vecs


def _cosine(vec_a, norm_a, vec_b, norm_b):
    """Return the cosine similarity of two vectors, or NaN if undefined.

    The similarity is undefined when either vector is missing (its norm is
    NaN) or has zero length.
    """
    denominator = norm_a * norm_b
    if not denominator > 0:  # also true when the product is NaN
        return math.nan
    return float(np.dot(vec_a, vec_b) / denominator)


def calculate_cosine_similarity(target_indices, output_path):
    """Find the sentences most similar to each target sentence.

    Sentence vectors are POS-weighted means of ELMo word vectors.  For every
    target the cosine similarity to all sentences is computed, the target row
    itself is excluded, and the five highest-scoring rows are resolved to a
    doctor name through ``handler``.

    Args:
        target_indices: Iterable of positional sentence indices to compare
            every other sentence against.
        output_path: Currently unused; nothing is written to disk.  Kept so
            existing callers keep working.

    Returns:
        A flat list with up to five dicts per target, in descending order of
        similarity within each target.  Every dict has the keys '医生集'
        (always 'B'), '医生名称' (first element of what
        ``handler.find_doctor_name_by_idx`` returns) and '相似度得分' (the
        cosine similarity as a float).
    """
    df = pd.read_excel(_POS_TABLE_PATH, sheet_name=_POS_TABLE_SHEET)

    sentence_vecs = _load_sentence_vectors(df)
    # Norms are computed once instead of once per sentence pair.
    norms = [
        math.nan if vec is None else float(np.linalg.norm(vec))
        for vec in sentence_vecs
    ]
    print("================222222==============")

    results = []
    for target_index in target_indices:
        target_vec = sentence_vecs[target_index]
        target_norm = norms[target_index]

        df['similarity'] = [
            _cosine(vec, norm, target_vec, target_norm)
            for vec, norm in zip(sentence_vecs, norms)
        ]

        # Drop the target row explicitly rather than assuming it ranks first:
        # a tie with an identical sentence or a NaN score would otherwise
        # remove the wrong row.
        is_other = np.ones(len(df), dtype=bool)
        is_other[target_index] = False
        top_rows = df[is_other].nlargest(_TOP_K, 'similarity')

        for idx, similarity in zip(top_rows['idx'].tolist(),
                                   top_rows['similarity'].tolist()):
            doctor_id = handler.find_doctor_id_by_idx(idx)
            doctor_names = handler.find_doctor_name_by_idx(doctor_id)
            results.append({
                '医生集': 'B',
                '医生名称': doctor_names[0],
                '相似度得分': similarity,
            })
    return results