import pandas as pd
import numpy as np
import jieba
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Similarity pipeline: Word2Vec embeddings + cosine similarity
# Load data from Excel
df = pd.read_excel('../datas/hebing.xlsx', sheet_name='Sheet1')

# Tokenize the text column with jieba (each cell becomes a list of tokens)
df['合并咨询文本'] = df['合并咨询文本'].apply(lambda x: list(jieba.cut(x)))

# Train Word2Vec: vector_size = embedding dim, window = context size,
# min_count = minimum word frequency, workers = thread count.
# Pass sg=1 for skip-gram; the default is CBOW.
model = Word2Vec(df['合并咨询文本'], vector_size=50, window=5, min_count=1, workers=4)
# print('恶心的词向量:', model.wv['恶心'])
# print('恶心相似的三个词向量:', model.wv.most_similar('恶心', topn=3))


def _doc_vector(tokens):
    """Return the mean of the word vectors for `tokens`.

    Falls back to a zero vector for an empty token list — np.mean over an
    empty list would return NaN and poison the cosine-similarity matrix.
    """
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)


# Vectorize each document as the mean of its word vectors
df['合并咨询文本'] = df['合并咨询文本'].apply(_doc_vector)

# Pairwise cosine similarity between all document vectors
similarity_matrix = cosine_similarity(df['合并咨询文本'].tolist())
df['similarity'] = [row.tolist() for row in similarity_matrix]

# Print results
print(similarity_matrix)
print("+++++++++++++++++++")

# Convert similarity matrix to DataFrame (rows/cols indexed by df.index)
similarity_df = pd.DataFrame(similarity_matrix, columns=df.index, index=df.index)
print(similarity_df)

# Write DataFrame to Excel
# similarity_df.to_excel('../datas/xsd.xlsx', sheet_name='xsd')

# Top 20 most similar DISTINCT document pairs.
# BUG FIX: ranking the raw stacked matrix is dominated by the diagonal
# (every document has similarity 1.0 with itself) and counts each pair
# twice (the matrix is symmetric). Keep only the strict upper triangle
# (k=1 excludes the diagonal) before ranking.
_upper_mask = np.triu(np.ones(similarity_df.shape, dtype=bool), k=1)
top_20 = similarity_df.where(_upper_mask).stack().nlargest(20)
print(top_20)