# DocSimElmoPlus.py 2.03 KB
import ast
import math

import numpy as np
import pandas as pd
from elmoformanylangs import Embedder
from sklearn.metrics.pairwise import cosine_similarity

# Load the ELMo embedder from the local model directory
e = Embedder('../../model/zhs.model')

# Target texts for the similarity computation: the `word_list` column of the
# sheet, with empty cells replaced by empty strings
df = pd.read_excel(r'../Result/医生分词结果词性表.xlsx', sheet_name='Sheet1')
word_list_column = df['word_list'].fillna('')
sents = word_list_column.tolist()

# One embedding result per entry of `sents`.
# NOTE(review): output_layer=-1 presumably selects the averaged ELMo layers --
# confirm against the ELMoForManyLangs README.
elmo_vecs = e.sents2elmo(sents, output_layer=-1)

# print(elmo_vecs)

# Part-of-speech tags and word index spans live in the same sheet that was
# already loaded into `df`, so reuse it instead of parsing the workbook again.
df1 = df

# Multiplier applied to a word vector, keyed by the word's part-of-speech tag
weight_dict = {'v': 1.2, 'a': 0.8}


def _parse_list_cell(cell):
    """Parse a spreadsheet cell that holds a Python list literal.

    Non-string cells (e.g. NaN from an empty cell) and blank strings yield an
    empty list. ``ast.literal_eval`` only accepts literals, so the cell
    content can never execute code the way ``eval`` would.
    """
    if not isinstance(cell, str) or not cell.strip():
        return []
    return ast.literal_eval(cell)


# Collapse character vectors into word vectors and apply the POS weights
word_vecs_plus = []
for i, char_vecs in enumerate(elmo_vecs):
    # Look up the tags and the (start, end) spans for this row
    pos_list = _parse_list_cell(df1.loc[i, 'pos_list'])
    pos_index = _parse_list_cell(df1.loc[i, 'pos_index'])
    words = []
    for j, (start, end) in enumerate(pos_index):
        span = char_vecs[start:end]
        if len(span) == 0:
            # An empty span has no character vectors to average; skipping it
            # avoids the division by zero the plain sum/len form would hit.
            continue
        # Average over the rows actually present, so a span that runs past
        # the end of the sentence is still a true mean.
        word_vec = np.mean(span, axis=0)
        weight = weight_dict.get(pos_list[j])
        if weight is not None:
            word_vec = word_vec * weight
        words.append({'vector': word_vec})
    word_vecs_plus.append(words)

print(word_vecs_plus)

# Format conversion: turn each sentence's list of {'vector': ...} dicts into a
# single 2-D array with one row per word.
# The empty (0, 1024) float64 array is stacked first so that a sentence with
# no words still yields a (0, 1024) array and the result dtype is float64.
# Stacking once per sentence avoids re-copying the growing matrix per word.
_empty_rows = np.zeros((0, 1024))
word_vecs_plus_fix = [
    np.vstack([_empty_rows] + [word['vector'] for word in words])
    for words in word_vecs_plus
]

# Compute the cosine similarity between every sentence and the first sentence
# (index 0); `sims` receives one entry per sentence, in row order.
def _sentence_vector(word_matrix):
    """Return the mean word vector of a sentence, or None when it is undefined.

    A sentence with no word vectors (zero rows) or whose mean vector has zero
    norm has no direction, so no cosine similarity can be computed for it.
    """
    if word_matrix.shape[0] == 0:
        return None
    mean_vec = word_matrix.mean(axis=0)
    if np.linalg.norm(mean_vec) == 0:
        return None
    return mean_vec


target_vec = word_vecs_plus_fix[0]
# The target is the same for every comparison, so average and norm it once.
target_mean = _sentence_vector(target_vec)
target_norm = None if target_mean is None else np.linalg.norm(target_mean)

sims = []
for vec in word_vecs_plus_fix:
    sent_mean = _sentence_vector(vec)
    if target_mean is None or sent_mean is None:
        # Undefined similarity is written as an empty cell instead of NaN
        sims.append('')
    else:
        sims.append(np.dot(sent_mean, target_mean) / (
                np.linalg.norm(sent_mean) * target_norm))

# Attach the similarity scores to the original DataFrame as a new column and
# export the table to an Excel file (row index omitted)
output_path = '../Result/余弦相似度结果Plus.xlsx'
df['similarity'] = sims
df.to_excel(output_path, sheet_name='Sheet1', index=False)