import pandas as pd
import numpy as np
import jieba
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Similarity: Word2Vec embeddings + cosine similarity

# Load data from Excel
df = pd.read_excel('骨科回答处理.xlsx', sheet_name='Sheet2')

# Tokenize the answer column with jieba
df['answer'] = df['answer'].apply(lambda x: list(jieba.cut(x)))

# Train the Word2Vec model: vector_size = embedding dimension, window = context window,
# min_count = minimum word frequency, workers = number of threads.
# Passing sg=1 would select the skip-gram model; the default is CBOW.
model = Word2Vec(df['answer'], vector_size=20, window=5, min_count=1, workers=4)

print('Word vector for 髋关节:', model.wv['髋关节'])
print('Top 3 words most similar to 髋关节:', model.wv.most_similar('髋关节', topn=3))

# Vectorize each answer as the mean of its word vectors
df['answer'] = df['answer'].apply(lambda x: np.mean([model.wv[word] for word in x], axis=0))

# Calculate the pairwise cosine similarity matrix
similarity_matrix = cosine_similarity(df['answer'].tolist())
df['similarity'] = [row.tolist() for row in similarity_matrix]

# Print results
print(similarity_matrix)
print("+++++++++++++++++++")

# Convert the similarity matrix to a DataFrame
similarity_df = pd.DataFrame(similarity_matrix, columns=df.index, index=df.index)
print(similarity_df)

# Write the DataFrame to Excel
# similarity_df.to_excel('../datas/xsd.xlsx', sheet_name='xsd')

# Find the 20 largest values in similarity_df (includes the all-1.0 diagonal)
# top_20 = similarity_df.stack().nlargest(20)
# print(top_20)

# Find the 20 largest values in similarity_df, excluding the diagonal
top_20 = similarity_df.mask(np.eye(len(df), dtype=bool)).stack().nlargest(20)
print(top_20)
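
# Optional follow-up (a sketch, not part of the original workflow): similarity_df is
# symmetric, so every pair in top_20 appears twice, once as (i, j) and once as (j, i).
# The lines below keep only the upper triangle and map each index pair back to the
# original answer text by re-reading the source sheet, since the 'answer' column was
# overwritten with vectors above. The names raw_answers, upper, and top_pairs are new
# here, and the 30-character truncation is only for readable display.
raw_answers = pd.read_excel('骨科回答处理.xlsx', sheet_name='Sheet2')['answer']
upper = similarity_df.where(np.triu(np.ones(similarity_df.shape, dtype=bool), k=1))
top_pairs = upper.stack().nlargest(10)
for (i, j), score in top_pairs.items():
    print(f'{score:.4f}  {raw_answers[i][:30]} | {raw_answers[j][:30]}')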