"""Compute the cosine similarity between every sentence and the first
sentence using mean-pooled ELMo word embeddings, then append the result
as a `similarity` column and save it to an Excel file.

Inputs : ../Result/医生分词结果词性表.xlsx (column `word_list`)
Outputs: ../Result/余弦相似度结果.xlsx
"""
import math
import pdb
import pprint

import numpy as np
import pandas as pd
from elmoformanylangs import Embedder

e = Embedder('../../model/zhs.model')

df = pd.read_excel(r'../Result/医生分词结果词性表.xlsx', sheet_name='Sheet1')
# NaN cells become '' so every row still yields an entry in `sims`.
sents = df['word_list'].fillna('').tolist()

# Word-level ELMo vectors for each sentence; output_layer=-1 averages the
# three ELMo layers.
elmo_vecs = e.sents2elmo(sents, output_layer=-1)

# Reference sentence is the FIRST one (index 0).
# NOTE(review): the original comment claimed the 3rd sentence was the
# reference, but the code used index 0 — the code's behavior is kept.
# Hoist the reference mean vector and its norm out of the loop; they are
# loop-invariant.
target_vec = elmo_vecs[0].mean(axis=0)
target_norm = np.linalg.norm(target_vec)

sims = []
for vec in elmo_vecs:
    # Rows that failed to embed come back as a float NaN instead of an
    # ndarray; record an empty cell for them.
    if isinstance(vec, float) and math.isnan(vec):
        sims.append('')
        continue
    # Sentence vector = mean over word vectors; compute it once per
    # sentence instead of once per use.
    sent_vec = vec.mean(axis=0)
    sims.append(np.dot(sent_vec, target_vec)
                / (np.linalg.norm(sent_vec) * target_norm))

# Append the cosine similarities and write the result to Excel.
df['similarity'] = sims
df.to_excel('../Result/余弦相似度结果.xlsx', sheet_name='Sheet1', index=False)