from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import pandas as pd
import time

start_time = time.time()

# Load the BERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
model = AutoModel.from_pretrained('bert-base-chinese')

# Load the data (cardiovascular similar-patient experiment sheet)
df = pd.read_excel('../Data/心血管相似患者实验.xlsx', sheet_name='Sheet1')
corpus = df['title'].tolist()

# Encode every corpus text in a single forward pass and keep the [CLS] vector
# of each as its sentence embedding
# (for large corpora, see the batched-encoding sketch at the end of this file)
encoded_dict = tokenizer(corpus, padding=True, truncation=True, max_length=32,
                         return_tensors='pt')
input_ids = encoded_dict['input_ids']
attention_mask = encoded_dict['attention_mask']
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
embeddings = outputs.last_hidden_state[:, 0, :].numpy()

# Embed the query the same way, score it against every corpus text, and keep
# the 50 most similar ones
query = "高血压患者能吃党参吗?"  # "Can hypertension patients take dangshen (Codonopsis)?"
query_encoded = tokenizer(
    query,
    add_special_tokens=True,
    max_length=32,
    padding='longest',
    truncation=True,
    return_tensors='pt')
query_input_ids = query_encoded['input_ids']
query_attention_mask = query_encoded['attention_mask']
with torch.no_grad():
    query_output = model(query_input_ids, attention_mask=query_attention_mask)
query_embedding = query_output.last_hidden_state[:, 0, :].numpy()

# Inner-product similarity on the raw (unnormalized) [CLS] vectors
# (see the cosine-similarity sketch at the end of this file)
similarities = np.inner(query_embedding, embeddings)  # shape (1, len(corpus))
top_k = 50
topk_idx = np.argsort(similarities, axis=1)[:, ::-1][:, :top_k].squeeze()
for idx in topk_idx:
    print(f"idx: {df.iloc[idx]['idx']}, sentence: {corpus[idx].strip()} "
          f"(Score: {similarities[0][idx]:.4f})")

end_time = time.time()
print("Elapsed time:", end_time - start_time, "seconds")

# Similarity between two sentences with BERT
# from transformers import AutoTokenizer, AutoModel
# import torch
#
# # Load the BERT model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
# model = AutoModel.from_pretrained('bert-base-chinese')
#
# sentence1 = "双腿肿,医生说是心肌缺血"  # "Both legs are swollen; the doctor says it is myocardial ischemia"
# sentence2 = "扩张型心肌病患者得病5年腹部大,下肢浮肿"  # "Dilated cardiomyopathy, ill for 5 years, distended abdomen, lower-limb edema"
#
# # Encode the two sentences as a batch of two sequences, not as one sentence
# # pair: a pair is packed into a single sequence with a single [CLS] token,
# # which leaves nothing to compare (the earlier version computed the inner
# # product of that lone [CLS] vector with itself, i.e. its squared norm)
# encoded_dict = tokenizer(
#     [sentence1, sentence2],
#     add_special_tokens=True,   # add [CLS] and [SEP]
#     max_length=32,             # truncation length
#     padding='longest',         # pad the shorter sentence to the longer one
#     truncation=True,
#     return_attention_mask=True,
#     return_tensors='pt',       # return PyTorch tensors
# )
#
# # Run the model and take the [CLS] vector of each sentence
# with torch.no_grad():
#     outputs = model(encoded_dict['input_ids'],
#                     attention_mask=encoded_dict['attention_mask'])
# cls_vectors = outputs.last_hidden_state[:, 0, :]  # shape (2, hidden_size)
#
# # Inner-product similarity between the two [CLS] vectors, matching the
# # scoring used in the main script
# similarity = torch.dot(cls_vectors[0], cls_vectors[1]).item()
# print("Similarity:", similarity)
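
# --- Batched corpus encoding (sketch) ---
# The main script encodes the whole corpus in one forward pass, which holds
# every sequence in memory at once. For larger sheets, a fixed-size batch loop
# bounds peak memory. This is a hedged sketch reusing `tokenizer`, `model`, and
# `corpus` from above; the batch size of 64 is an arbitrary assumption, not tuned.
# batch_embeddings = []
# for i in range(0, len(corpus), 64):
#     batch = tokenizer(corpus[i:i + 64], padding=True, truncation=True,
#                       max_length=32, return_tensors='pt')
#     with torch.no_grad():
#         out = model(batch['input_ids'], attention_mask=batch['attention_mask'])
#     batch_embeddings.append(out.last_hidden_state[:, 0, :].numpy())
# embeddings = np.concatenate(batch_embeddings, axis=0)  # same shape as before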
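
# --- Cosine similarity (sketch) ---
# np.inner on unnormalized [CLS] vectors mixes vector magnitude into the
# score, so longer vectors rank higher regardless of direction. L2-normalizing
# both sides first turns the same inner product into cosine similarity, the
# more common choice for embedding retrieval. A sketch reusing
# `query_embedding` and `embeddings` from the main script.
# corpus_norm = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
# query_norm = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)
# cosine_similarities = np.inner(query_norm, corpus_norm)  # values in [-1, 1]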