1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
import numpy as np
import jieba
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import time
# Similarity search: Word2Vec sentence embeddings + cosine similarity.
#
# Pipeline: load titles from Excel -> tokenize with jieba -> train Word2Vec on
# the tokenized titles -> average word vectors per title -> rank all titles by
# cosine similarity to a fixed query sentence -> print the best matches.

# Maximum number of ranked rows to keep. NOTE: the result variable below is
# still called ``top_20`` for backward compatibility, but the cutoff is 2000.
TOP_N = 2000


def _tokenize(text):
    """Return the jieba tokens of *text* as a list.

    Missing values (e.g. blank Excel cells, which pandas reads as NaN) yield
    an empty list instead of crashing inside ``jieba.cut``; any other
    non-string value is converted with ``str()`` first.
    """
    if pd.isna(text):
        return []
    return list(jieba.cut(str(text)))


def _mean_vector(tokens, keyed_vectors):
    """Return the average word vector of the in-vocabulary *tokens*.

    Tokens unknown to *keyed_vectors* are skipped. When no token is known
    (or *tokens* is empty) a zero vector of the model's dimensionality is
    returned, so every row keeps a well-formed vector of the same shape.
    """
    known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not known:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(known, axis=0)


# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments.
start_time = time.perf_counter()

# Load data from Excel
df = pd.read_excel('../Data/心血管相似患者实验.xlsx', sheet_name='Sheet1')

# Tokenize the title column
df['answer'] = df['title'].apply(_tokenize)

# Train the Word2Vec model. Arguments: vector dimensionality, context window
# size, minimum word frequency, number of worker threads.
# Pass sg=1 to use skip-gram; the default is CBOW.
model = Word2Vec(df['answer'], vector_size=20, window=5, min_count=1, workers=4)

# Replace each token list with its sentence vector (mean of word vectors)
df['answer'] = df['answer'].apply(lambda tokens: _mean_vector(tokens, model.wv))

# Embed the query text. The model only knows words that occur in the titles,
# so unknown query tokens are skipped instead of raising KeyError.
input_text = "高血压患者能吃党参吗?"
query_tokens = [word for word in jieba.cut(input_text) if word in model.wv]
if not query_tokens:
    # A zero query vector would give every row a similarity of 0, which is a
    # meaningless ranking; fail loudly instead.
    raise ValueError(
        f"None of the tokens in the query {input_text!r} are in the "
        "Word2Vec vocabulary; cannot compute similarity."
    )
input_text_vector = _mean_vector(query_tokens, model.wv)

# Cosine similarity between the query and every title vector
similarity_scores = cosine_similarity([input_text_vector], df['answer'].tolist())[0]

# Add similarity scores to the DataFrame
df['similarity'] = similarity_scores

# Sort by similarity (best first) and keep at most TOP_N rows
top_20 = df.sort_values(by=['similarity'], ascending=False)[:TOP_N]

# Print results
print(top_20[['idx', 'title', 'similarity']])

end_time = time.perf_counter()
print("代码运行时间为:", end_time - start_time, "秒")