import pandas as pd
import numpy as np
import jieba
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import time

start_time = time.time()

# Similarity: Word2Vec + cosine similarity

# Load data from Excel
df = pd.read_excel('../Data/心血管相似患者实验.xlsx', sheet_name='Sheet1')

# Tokenize the title column
df['answer'] = df['title'].apply(lambda x: list(jieba.cut(x)))

# Train Word2Vec model: vector size, window size, minimum word frequency, worker threads
# sg=1 selects the skip-gram model; the default is CBOW
model = Word2Vec(df['answer'], vector_size=20, window=5, min_count=1, workers=4)
# print('Word vector for 髋关节:', model.wv['髋关节'])
# print('Three words most similar to 髋关节:', model.wv.most_similar('髋关节', topn=3))

# Vectorize each tokenized title as the mean of its word vectors
df['answer'] = df['answer'].apply(lambda x: np.mean([model.wv[word] for word in x], axis=0))

# Calculate cosine similarity with the input text
# (query: "Can hypertensive patients eat dangshen?")
input_text = "高血压患者能吃党参吗?"
# Skip words missing from the vocabulary to avoid a KeyError
input_text_vector = np.mean(
    [model.wv[word] for word in jieba.cut(input_text) if word in model.wv], axis=0)
similarity_scores = cosine_similarity([input_text_vector], df['answer'].tolist())[0]

# Add similarity scores to the DataFrame
df['similarity'] = similarity_scores

# Sort the DataFrame by similarity score and select the top 20
top_20 = df.sort_values(by=['similarity'], ascending=False)[:20]

# Print results
print(top_20[['idx', 'title', 'similarity']])

end_time = time.time()
print("Runtime:", end_time - start_time, "seconds")