"""Pairwise text similarity: Word2Vec document embeddings + cosine similarity."""

import pandas as pd
import numpy as np
import jieba
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Load data from Excel
df = pd.read_excel('../datas/hebing.xlsx', sheet_name='Sheet1')

# Tokenize the text column.
# fillna/astype guard: Excel cells may be empty (NaN) or non-string, and
# jieba.cut() requires a str — without this the apply() raises.
df['合并咨询文本'] = (
    df['合并咨询文本']
    .fillna('')
    .astype(str)
    .apply(lambda x: list(jieba.cut(x)))
)

# Train Word2Vec: vector_size = embedding dimension, window = context size,
# min_count = minimum word frequency, workers = threads.
# (Passing sg=1 would select skip-gram; the default is CBOW.)
model = Word2Vec(df['合并咨询文本'], vector_size=50, window=5, min_count=1, workers=4)
# print('恶心的词向量:', model.wv['恶心'])
# print('恶心相似的三个词向量:', model.wv.most_similar('恶心', topn=3))


def _mean_vector(tokens):
    """Average the Word2Vec vectors of *tokens*.

    Returns a zero vector for an empty token list — np.mean([]) would
    otherwise produce NaN and poison the cosine-similarity matrix.
    """
    vecs = [model.wv[w] for w in tokens if w in model.wv]
    if not vecs:
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vecs, axis=0)


# Replace each token list with its averaged document embedding.
df['合并咨询文本'] = df['合并咨询文本'].apply(_mean_vector)

# Pairwise cosine similarity between all document embeddings.
similarity_matrix = cosine_similarity(df['合并咨询文本'].tolist())
df['similarity'] = [row.tolist() for row in similarity_matrix]

# Print results
print(similarity_matrix)
print("+++++++++++++++++++")

# Convert similarity matrix to DataFrame (rows/columns indexed by df.index).
similarity_df = pd.DataFrame(similarity_matrix, columns=df.index, index=df.index)
print(similarity_df)

# Write DataFrame to Excel
# similarity_df.to_excel('../datas/xsd.xlsx', sheet_name='xsd')

# Top-20 most similar distinct document pairs.
# Bug fix: stacking the full matrix let nlargest() return mostly trivial
# entries — the diagonal is all 1.0 (self-similarity) and every pair appears
# twice by symmetry. Mask to the strict upper triangle (k=1) first.
upper_triangle = similarity_df.where(
    np.triu(np.ones(similarity_df.shape, dtype=bool), k=1)
)
top_20 = upper_triangle.stack().nlargest(20)
print(top_20)