"""Evaluate Word2Vec-based doctor recommendations against a labelled test set.

For every question in the test workbook the script ranks the titles of the
reference workbook by Word2Vec similarity, keeps the doctors attached to the
best-matching distinct titles, and prints two metrics: the top-k hit rate and
the mean reciprocal rank of the expected doctor.
"""

import math

import pandas as pd

# Pre-trained Word2Vec model.
MODEL_PATH = '../demo/test.model'

# Reference workbook: consultation titles and the doctor who handled each one.
REFERENCE_FILE = '消化内科对话_已解决问题0510.xlsx'
REFERENCE_SHEET = 'w'

# Test workbook: questions ("ask") and the expected doctor ("doctor").
TEST_FILE = '导医测试数据.xlsx'
TEST_SHEET = 'Sheet1'

# Number of distinct titles (and therefore doctors) kept per question.
TOP_K = 10

# Similarities this close to 1.0 are treated as "identical title" and dropped.
# An exact `== 1.0` test is unreliable on floating-point cosine values.
EXACT_MATCH_TOLERANCE = 1e-6


def _as_text(value):
    """Return *value* as a string, mapping missing cells (None/NaN) to ''."""
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return ""
    return str(value)


def recommend_doctors(sentences, titles, doctors, tokenize, similarity,
                      top_k=TOP_K):
    """Return, for each sentence, the doctors of its most similar titles.

    Args:
        sentences: Query texts; missing cells (None/NaN) yield an empty list.
        titles: Reference titles, parallel to ``doctors``.
        doctors: Doctor attached to each reference title.
        tokenize: Callable mapping a string to a list of tokens.
        similarity: Callable ``(tokens_a, tokens_b) -> float``.
        top_k: Maximum number of distinct titles kept per sentence.

    Returns:
        A list with one entry per sentence; each entry is a list of at most
        ``top_k`` doctor names ordered from most to least similar title.
    """
    # Title tokenisation does not depend on the query, so do it once up front
    # instead of once per sentence. Titles with no tokens can never be scored.
    reference = []
    for title, doctor in zip(titles, doctors):
        title_text = _as_text(title)
        title_tokens = tokenize(title_text)
        if title_tokens:
            reference.append((title_text, str(doctor), title_tokens))

    recommendations = []
    for sentence in sentences:
        query_tokens = tokenize(_as_text(sentence))

        scored = []
        if query_tokens:
            for title_text, doctor_name, title_tokens in reference:
                score = similarity(query_tokens, title_tokens)
                # Skip titles that are (numerically) identical to the query.
                if math.isclose(score, 1.0, rel_tol=0.0,
                                abs_tol=EXACT_MATCH_TOLERANCE):
                    continue
                scored.append((score, title_text, doctor_name))
            # Tuple ordering: highest similarity first, ties broken by title
            # and then doctor name (both descending).
            scored.sort(reverse=True)

        # Keep only the first (best) occurrence of each title.
        seen_titles = set()
        picked = []
        for _, title_text, doctor_name in scored:
            if len(picked) >= top_k:
                break
            if title_text in seen_titles:
                continue
            seen_titles.add(title_text)
            picked.append(doctor_name)
        recommendations.append(picked)

    return recommendations


def calculate_accuracy(candidate_set, test_set):
    """Return the fraction of queries whose expected doctor was recommended.

    Args:
        candidate_set: One list of recommended doctors per query.
        test_set: Expected doctor per query, parallel to ``candidate_set``.

    Returns:
        Hits divided by ``len(candidate_set)``; 0.0 when there are no queries.
    """
    if not candidate_set:
        return 0.0
    hits = sum(
        1
        for candidates, expected in zip(candidate_set, test_set)
        if expected in candidates
    )
    return hits / len(candidate_set)


def calculate_average_precision(candidate_set, test_set):
    """Return the mean reciprocal rank of the expected doctor.

    For each query the score is ``1 / rank`` of the first candidate equal to
    the expected doctor (0 when it is absent); scores are averaged over
    ``len(candidate_set)``.

    Args:
        candidate_set: One ranked list of recommended doctors per query.
        test_set: Expected doctor per query, parallel to ``candidate_set``.

    Returns:
        The averaged score as a float; 0.0 when there are no queries.
    """
    if not candidate_set:
        return 0.0

    total = 0.0
    # zip() stops at the shorter input, so a length mismatch cannot raise.
    for candidates, expected in zip(candidate_set, test_set):
        for rank, candidate in enumerate(candidates, start=1):
            # Exact match, consistent with calculate_accuracy. A substring
            # test would let a partial or empty name count as a hit.
            if candidate == expected:
                total += 1.0 / rank
                break
    return total / len(candidate_set)


def main():
    """Load the model and workbooks, build recommendations, print metrics."""
    # gensim is imported here so the metric helpers above can be imported
    # (and tested) without the heavy dependency being installed.
    from gensim.models import Word2Vec
    from gensim.utils import simple_preprocess

    model = Word2Vec.load(MODEL_PATH)
    vectors = model.wv

    def tokenize(text):
        # NOTE(review): depending on the gensim version, n_similarity may
        # raise KeyError for unknown tokens; dropping them first is safe
        # either way.
        return [token for token in simple_preprocess(text) if token in vectors]

    reference_df = pd.read_excel(REFERENCE_FILE, sheet_name=REFERENCE_SHEET)
    titles = reference_df['标题'].tolist()
    doctors = reference_df['医生姓名'].tolist()

    # Sample queries kept from the original script for ad-hoc runs:
    # sentences = ["请 彩超 报告 分析", "非 萎缩 胃窦炎 伴 糜烂",
    #              "脾胃 湿热 消化不良 有时候 脚 冰凉 小便 次"]
    # The test workbook is read once; it holds both queries and labels.
    test_df = pd.read_excel(TEST_FILE, sheet_name=TEST_SHEET)
    sentences = test_df['ask'].tolist()
    test_doctors = test_df['doctor'].tolist()

    doctors_lists = recommend_doctors(
        sentences, titles, doctors, tokenize, vectors.n_similarity
    )

    accuracy = calculate_accuracy(doctors_lists, test_doctors)
    print("准确率:", accuracy)

    average_precision = calculate_average_precision(doctors_lists, test_doctors)
    print(f"平均精准度: {average_precision:.2%}")


if __name__ == "__main__":
    main()