"""Compute sentence similarity for two pre-segmented Chinese sentences.

Each sentence is embedded with a pretrained Chinese ELMo model
(elmoformanylangs), token vectors are mean-pooled into one sentence
vector, and the cosine similarity of the two sentence vectors is printed.
"""

import numpy as np


def mean_pool(token_vectors):
    """Average per-token embeddings of shape (seq_len, dim) into a (dim,) vector."""
    return np.mean(token_vectors, axis=0)


def cosine_similarity_vec(vec_a, vec_b):
    """Return the cosine similarity of two 1-D vectors as a float.

    Returns 0.0 when either vector has zero norm, instead of dividing
    by zero and producing NaN.
    """
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(vec_a, vec_b) / denom)


def main():
    # Imported lazily so this module can be imported (e.g. for the pure
    # numpy helpers above) without the ELMo package or model files.
    from elmoformanylangs import Embedder

    e = Embedder('../model/zhs.model')

    # Sentences must already be word-segmented (lists of tokens).
    sents = [
        ['这个', '菜', '很', '好吃', ',', '口感', '很', '棒', '。'],
        ['这道', '菜', '味道', '非常', '好', ',', '咬', '一口', '就', '能',
         '感受到', '浓郁', '的', '香气', '。'],
    ]

    # sents2elmo returns one numpy array per sentence, each with shape
    # (seq_len, embedding_size); output_layer=-1 averages the ELMo layers.
    ch_em = e.sents2elmo(sents, output_layer=-1)
    print(ch_em)

    # Mean-pool token vectors into one vector per sentence, then compare.
    vec1 = mean_pool(ch_em[0])
    vec2 = mean_pool(ch_em[1])

    similarity = cosine_similarity_vec(vec1, vec2)
    print(similarity)


if __name__ == "__main__":
    main()