"""Compute the cosine similarity between every sentence and the first
sentence using mean-pooled ELMo word embeddings, then append the result
as a `similarity` column and save it to an Excel file.

Inputs : ../Result/医生分词结果词性表.xlsx (column `word_list`)
Outputs: ../Result/余弦相似度结果.xlsx
"""
import math
import pdb
import pprint

import numpy as np
import pandas as pd
from elmoformanylangs import Embedder

e = Embedder('../../model/zhs.model')

df = pd.read_excel(r'../Result/医生分词结果词性表.xlsx', sheet_name='Sheet1')
# NaN cells become '' so every row still yields an entry in `sims`.
sents = df['word_list'].fillna('').tolist()

# Word-level ELMo vectors for each sentence; output_layer=-1 averages the
# three ELMo layers.
elmo_vecs = e.sents2elmo(sents, output_layer=-1)

# Reference sentence is the FIRST one (index 0).
# NOTE(review): the original comment claimed the 3rd sentence was the
# reference, but the code used index 0 — the code's behavior is kept.
# Hoist the reference mean vector and its norm out of the loop; they are
# loop-invariant.
target_vec = elmo_vecs[0].mean(axis=0)
target_norm = np.linalg.norm(target_vec)

sims = []
for vec in elmo_vecs:
    # Rows that failed to embed come back as a float NaN instead of an
    # ndarray; record an empty cell for them.
    if isinstance(vec, float) and math.isnan(vec):
        sims.append('')
        continue
    # Sentence vector = mean over word vectors; compute it once per
    # sentence instead of once per use.
    sent_vec = vec.mean(axis=0)
    sims.append(np.dot(sent_vec, target_vec)
                / (np.linalg.norm(sent_vec) * target_norm))

# Append the cosine similarities and write the result to Excel.
df['similarity'] = sims
df.to_excel('../Result/余弦相似度结果.xlsx', sheet_name='Sheet1', index=False)