from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


class OutputQStruct:
    """One retrieved question: row id, doctor name, title text, similarity score."""

    def __init__(self, idx, name, sentence, score):
        self.idx = idx
        self.name = name
        self.sentence = sentence
        self.score = score


def process_data(query):
    # Create an empty list for the output values
    output_values = []

    # Load the BERT model and tokenizer (note: reloaded on every call;
    # hoist to module level if process_data is called repeatedly)
    tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
    model = AutoModel.from_pretrained('bert-base-chinese')

    # Load the data
    df = pd.read_excel('../../data-dev/消化内科对话_已解决问题0510.xlsx',
                       sheet_name='对话数据+')
    corpus = df['标题'].tolist()

    # Vectorize all corpus titles; use the [CLS] token embedding as the sentence vector
    encoded_dict = tokenizer(corpus, padding=True, truncation=True,
                             max_length=32, return_tensors='pt')
    input_ids = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state[:, 0, :].numpy()

    # Vectorize the query the same way
    query_encoded = tokenizer(query, padding=True, truncation=True,
                              max_length=32, return_tensors='pt')
    query_input_ids = query_encoded['input_ids']
    query_attention_mask = query_encoded['attention_mask']
    with torch.no_grad():
        query_output = model(query_input_ids, attention_mask=query_attention_mask)
        query_embedding = query_output.last_hidden_state[:, 0, :].numpy()

    # Inner-product similarity between the query and every corpus title
    similarities = np.inner(query_embedding, embeddings)
    similarities = similarities.squeeze()  # drop the leading singleton dimension

    # Rescale the scores to [0, 1]
    scaler = MinMaxScaler()
    similarities = scaler.fit_transform(similarities.reshape(-1, 1)).squeeze()

    # Take the 50 most similar titles, highest score first
    top50_idx = similarities.argsort()[::-1][:50]
    for idx in top50_idx:
        output_struct = OutputQStruct(df.iloc[idx]['idx'],
                                      df.iloc[idx]['医生姓名'],
                                      corpus[idx].strip(),
                                      similarities[idx])
        output_values.append(output_struct)

    # Return the list of output values
    return output_values
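

# Usage sketch: a minimal way to exercise process_data. The query string
# below is a hypothetical example (any Chinese medical question works,
# since the model is bert-base-chinese), and this assumes the Excel file
# exists at the relative path used above.
if __name__ == '__main__':
    results = process_data('胃疼三天了，吃什么药好')  # hypothetical sample query
    for item in results[:5]:
        # Print id, doctor name, normalized score, and the matched title
        print(f"{item.idx}\t{item.name}\t{item.score:.4f}\t{item.sentence}")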