"""Compute sentence similarity for two pre-segmented Chinese sentences.

Each sentence is embedded with a pretrained Chinese ELMo model
(elmoformanylangs), token vectors are mean-pooled into one sentence
vector, and the cosine similarity of the two sentence vectors is printed.
"""

import numpy as np


def mean_pool(token_vectors):
    """Average per-token embeddings of shape (seq_len, dim) into a (dim,) vector."""
    return np.mean(token_vectors, axis=0)


def cosine_similarity_vec(vec_a, vec_b):
    """Return the cosine similarity of two 1-D vectors as a float.

    Returns 0.0 when either vector has zero norm, instead of dividing
    by zero and producing NaN.
    """
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(vec_a, vec_b) / denom)


def main():
    # Imported lazily so this module can be imported (e.g. for the pure
    # numpy helpers above) without the ELMo package or model files.
    from elmoformanylangs import Embedder

    e = Embedder('../model/zhs.model')

    # Sentences must already be word-segmented (lists of tokens).
    sents = [
        ['这个', '菜', '很', '好吃', ',', '口感', '很', '棒', '。'],
        ['这道', '菜', '味道', '非常', '好', ',', '咬', '一口', '就', '能',
         '感受到', '浓郁', '的', '香气', '。'],
    ]

    # sents2elmo returns one numpy array per sentence, each with shape
    # (seq_len, embedding_size); output_layer=-1 averages the ELMo layers.
    ch_em = e.sents2elmo(sents, output_layer=-1)
    print(ch_em)

    # Mean-pool token vectors into one vector per sentence, then compare.
    vec1 = mean_pool(ch_em[0])
    vec2 = mean_pool(ch_em[1])

    similarity = cosine_similarity_vec(vec1, vec2)
    print(similarity)


if __name__ == "__main__":
    main()