from collections import Counter

import pandas as pd

import DocSimElmoPlus0516
import DotProSim
import PatSimBert0516
import handler

# Multiplier applied to a record's score according to the candidate set
# ('医生集') it came from. Records from any other set keep their raw score.
_SET_WEIGHTS = {'A': 0.6, 'B': 0.3, 'C': 0.1}

# Default denominator used by calculate_coverage.
TOTAL_DOCTORS = 229


def calculate_similarity(doctors, min_occurrences=2, top_k=10):
    """Rank doctors that occur repeatedly in *doctors* by weighted score.

    Args:
        doctors: Re-iterable collection of dicts with the keys '医生集'
            (candidate set label), '医生名称' (doctor name) and '相似度得分'
            (similarity score). It is iterated twice.
        min_occurrences: A doctor is scored only when the name occurs at
            least this many times. The default of 2 matches the original
            ``count > 1`` test.
        top_k: Maximum number of ranked entries to return; ``None`` returns
            the full ranking.

    Returns:
        A list of ``(name, total_score)`` tuples sorted by total score in
        descending order, truncated to *top_k* entries.
    """
    name_counts = Counter(doctor['医生名称'] for doctor in doctors)
    # A set keeps the membership test in the scoring loop O(1).
    selected_names = {
        name for name, count in name_counts.items() if count >= min_occurrences
    }

    similarity_scores = {}
    for doctor in doctors:
        name = doctor['医生名称']
        score = doctor['相似度得分']
        if name not in selected_names:
            continue
        weight = _SET_WEIGHTS.get(doctor['医生集'])
        if weight is not None:
            score = score * weight
        if name in similarity_scores:
            similarity_scores[name] += score
        else:
            similarity_scores[name] = score

    # sorted() is stable, so doctors with equal totals keep first-seen order.
    sorted_results = sorted(
        similarity_scores.items(), key=lambda item: item[1], reverse=True
    )
    return sorted_results[:top_k]


def calculate_precision(candidates, test_data):
    """Return the fraction of test cases whose doctor was recommended.

    Args:
        candidates: Sequence aligned with *test_data*; entry ``i`` is an
            iterable of items whose first element is a doctor name (for
            example the ``(name, score)`` tuples produced by
            calculate_similarity).
        test_data: Sequence of test cases, each an iterable of expected
            doctor names.

    Returns:
        ``matched / len(test_data)``, where a test case counts as matched
        (once) when any of its doctors equals the name of any candidate at
        the same index. Returns 0 when *test_data* is empty.
    """
    total_tests = len(test_data)
    if total_tests == 0:
        return 0

    matched_tests = 0
    for i, test in enumerate(test_data):
        # Count each test case at most once so the result cannot exceed 1.
        if any(
            doctor == candidate[0]
            for doctor in test
            for candidate in candidates[i]
        ):
            matched_tests += 1
    return matched_tests / total_tests


def calculate_average_precision(candidate_set, test_set):
    """Return the mean reciprocal rank of the expected doctor, over hits only.

    Args:
        candidate_set: Sequence aligned with *test_set*; entry ``i`` is the
            ranked list of candidates for query ``i``.
        test_set: Sequence of expected values, one per query.

    Returns:
        The average of ``1 / position`` (1-based) of the first candidate that
        contains the expected value, taken over the queries that had such a
        candidate. Queries without a hit are not part of the denominator.
        Returns 0.0 when no query has a hit.
    """
    total_precision = 0.0
    relevant_count = 0
    for i, query in enumerate(test_set):
        for position, candidate in enumerate(candidate_set[i], start=1):
            # Containment test: with (name, score) tuples this matches when
            # the expected value equals either element.
            if query in candidate:
                total_precision += 1.0 / position
                relevant_count += 1
                break

    if relevant_count == 0:
        return 0.0
    return total_precision / relevant_count


def calculate_coverage(candidate_sets, total_doctors=TOTAL_DOCTORS):
    """Return the average share of doctors covered per candidate list.

    Args:
        candidate_sets: Iterable of candidate lists; each candidate is a dict
            with the key '医生名称' (doctor name).
        total_doctors: Size of the doctor population used as the denominator
            (229 by default).

    Returns:
        The mean over all candidate lists of
        ``distinct doctor names / total_doctors``, or 0.0 when
        *candidate_sets* is empty.

    Raises:
        ValueError: If *total_doctors* is not positive.
    """
    if total_doctors <= 0:
        raise ValueError("total_doctors must be positive")

    total_coverage = 0.0
    candidate_count = 0
    for candidates in candidate_sets:
        distinct_doctors = {candidate['医生名称'] for candidate in candidates}
        total_coverage += len(distinct_doctors) / total_doctors
        candidate_count += 1

    if candidate_count == 0:
        return 0.0
    return total_coverage / candidate_count
def _flatten_results(items):
    """Return a flat list with every nested list in *items* expanded in order."""
    flattened = []
    for item in items:
        if isinstance(item, list):
            flattened.extend(_flatten_results(item))
        else:
            flattened.append(item)
    return flattened


def main():
    """Run the doctor-recommendation evaluation over the Excel test set.

    For every question in the 'ask' column the function collects candidate
    doctors from three sources (similar questions -> set A, the records
    returned by DocSimElmoPlus0516 for the adopted answers, and similar
    doctors -> set C), prints them, ranks them with calculate_similarity,
    writes all rankings to 'results_s.txt', and finally prints precision,
    average precision and coverage against the 'doctor' column.

    NOTE(review): the loop nesting below was reconstructed from a
    whitespace-collapsed source; confirm the placement of the per-doctor
    blank-line print and of the results-file write against the original file.
    """
    # The workbook provides both the queries ('ask') and the expected
    # doctors ('doctor'); read it once and reuse the frame.
    excel_file = '../data/导医测试数据.xlsx'
    sheet_name = 'Sheet1'
    df = pd.read_excel(excel_file, sheet_name=sheet_name)
    queries = df['ask'].tolist()

    results_s = []   # ranked recommendations, one list per query
    results1_s = []  # unranked (flattened) candidate records, one list per query

    for query in queries:
        # --- Similar questions ---
        print(f"咨询问题:{query}")
        # Similar consultations returned for the query (the original comment
        # says the top 50).
        output_values = PatSimBert0516.process_data(query)

        # Collect up to five distinct sentences, skipping the query itself.
        unique_sentences = set()
        for output_struct in output_values:
            if output_struct.sentence == query:
                continue
            unique_sentences.add(output_struct.sentence)
            if len(unique_sentences) == 5:
                break

        # Keep every returned record whose sentence is one of those selected.
        matching_data = [
            output_struct
            for output_struct in output_values
            if output_struct.sentence in unique_sentences
        ]

        results = []  # candidate records for this query
        print("========================医生集A=============================")
        for dataA in matching_data:
            print(f"医生集A: name: {dataA.name}, Score: {dataA.score:.4f}")
            results.append({
                '医生集': 'A',
                '医生名称': dataA.name,
                '相似度得分': dataA.score
            })

        # Indices of the adopted answers.
        min_indices = handler.find_minimum_idx(matching_data)
        print(f"已采纳的回答idx: {min_indices}")

        # --- Similar answers ---
        target_indices = [x - 1 for x in min_indices]
        output_path = '0516'  # labelled "date" in the original comment
        cosine_similarity = DocSimElmoPlus0516.calculate_cosine_similarity(
            target_indices, output_path
        )
        # May be a (nested) list of records; it is flattened further down.
        results.append(cosine_similarity)

        # --- Similar doctors ---
        doctor_idxs = [
            handler.find_doctor_id_by_idx(target_indice)
            for target_indice in min_indices
        ]
        similar_doctors = DotProSim.find_top_similar_doctors(doctor_idxs)

        print("================3333333==============")
        for doctor in similar_doctors:
            for similar_idx, similarity in doctor['similarities']:
                name_by_idx = handler.find_doctor_name_by_idx(similar_idx)
                name_doctor = name_by_idx[0]
                results.append({
                    '医生集': 'C',
                    '医生名称': name_doctor,
                    '相似度得分': similarity
                })
            print()

        # results_1: unprocessed candidate records with nested lists expanded.
        results_1 = _flatten_results(results)
        results1_s.append(results_1)

        print("----------自动化0624----------")
        for result in results_1:
            print(f"医生集: {result['医生集']}, 医生名称: {result['医生名称']}, 相似度得分: {result['相似度得分']}")

        # 1. Process the candidate records: doctors that occur more than once
        #    are scored with weights A 0.6, B 0.3, C 0.1, giving a ranking of
        #    (doctor, similarity).
        # 2. Metrics:
        #    2.1 Precision: 1 if the test doctor is among the candidates,
        #        otherwise 0, averaged over the test set.
        #    2.2 Average precision: 1 / position of each doctor that is
        #        present, averaged.
        #    2.3 Coverage: number of distinct doctors m in sets A, B and C,
        #        as m / 229.
        # results_2: ranked recommendations for this query.
        results_2 = calculate_similarity(results_1)
        for result in results_2:
            name = result[0]
            similarity = result[1]
            print(f"医生名称: {name}, 相似度: {similarity}")
        print("-----end0625------")
        results_s.append(results_2)

    print("-----end0625------")

    # One line per query: the space-separated (name, score) tuples.
    with open('results_s.txt', 'w', encoding='utf-8') as file:
        for result in results_s:
            result_str = ' '.join(str(item) for item in result)
            file.write(result_str + '\n')

    # Expected doctors from the test set.
    test_data = df['doctor'].tolist()
    test_data1 = [[doctor] for doctor in test_data]

    # Precision
    precision = calculate_precision(results_s, test_data1)
    print(f"测试集的准确率: {precision:.2%}")

    # Average precision
    average_precision = calculate_average_precision(results_s, test_data)
    print(f"平均精准度: {average_precision:.2%}")

    # Coverage
    coverage = calculate_coverage(results1_s)
    print("医生覆盖率: {:.2%}".format(coverage))


if __name__ == "__main__":
    main()