"""Print the corpus sentences most similar to each query sentence.

The corpus is read from the ``title`` column of an Excel sheet.  Corpus and
query sentences are embedded with a SentenceTransformer model, corpus rows
are ranked by cosine similarity to each query, and the best ``CLOSEST_N``
rows are printed together with their ``idx`` column value and score.
"""
import time

import pandas as pd
import scipy.spatial

MODEL_NAME = 'bert-base-chinese'
DATA_PATH = '../Data/心血管相似患者实验.xlsx'
SHEET_NAME = 'Sheet1'
TEXT_COLUMN = 'title'
ID_COLUMN = 'idx'

# Sentences to look up in the corpus.
QUERIES = ['高血压患者能吃党参吗?']

# Number of most-similar corpus sentences reported per query.
CLOSEST_N = 50


def rank_by_cosine_similarity(query_embedding, corpus_embeddings, top_n):
    """Return the ``top_n`` corpus rows closest to ``query_embedding``.

    Args:
        query_embedding: 1-D sequence of floats, the embedded query.
        corpus_embeddings: 2-D sequence with one embedding per corpus row.
        top_n: Maximum number of rows to return.

    Returns:
        A list of ``(row_index, similarity)`` tuples ordered from most to
        least similar, where ``similarity`` is ``1 - cosine distance``.
        The list is empty when ``top_n`` is not positive or the corpus is
        empty, and shorter than ``top_n`` when the corpus is smaller.
    """
    if top_n <= 0 or len(corpus_embeddings) == 0:
        return []

    distances = scipy.spatial.distance.cdist(
        [query_embedding], corpus_embeddings, "cosine"
    )[0]
    # Ascending distance == descending similarity.  A stable sort keeps
    # equally distant rows in corpus order, and argsort places NaN
    # distances (e.g. from all-zero embeddings) at the end.
    order = distances.argsort(kind="stable")[:top_n]
    return [(int(i), 1.0 - float(distances[i])) for i in order]


def main():
    """Embed the corpus and the queries, then print the closest matches."""
    # Imported here rather than at module level so that the ranking helper
    # can be imported without this third-party package being installed.
    # It is imported before the timer starts, as in the original script.
    from sentence_transformers import SentenceTransformer

    start_time = time.perf_counter()

    embedder = SentenceTransformer(MODEL_NAME)

    df = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)
    # Blank Excel cells load as NaN, which can be neither embedded nor
    # stripped; drop them and renumber so positions in `corpus` and `df`
    # stay aligned.
    df = df.dropna(subset=[TEXT_COLUMN]).reset_index(drop=True)
    corpus = df[TEXT_COLUMN].astype(str).tolist()

    corpus_embeddings = embedder.encode(corpus)
    query_embeddings = embedder.encode(QUERIES)

    for query, query_embedding in zip(QUERIES, query_embeddings):
        ranked = rank_by_cosine_similarity(
            query_embedding, corpus_embeddings, CLOSEST_N
        )

        print("====================================")
        print("Query:", query)
        # Report the number of rows actually listed instead of a
        # hard-coded figure.
        print(f"Result:Top {len(ranked)} most similar sentences in corpus:")

        for idx, score in ranked:
            print(
                f"idx: {df.iloc[idx][ID_COLUMN]}, "
                f"sentence: {corpus[idx].strip()} (Score: {score:.4f})"
            )

    end_time = time.perf_counter()
    print("代码运行时间为:", end_time - start_time, "秒")


if __name__ == "__main__":
    main()