From 176a2f4ca545acf852543d05a52792fda32fa66f Mon Sep 17 00:00:00 2001
From: lichusong <2661058231@qq.com>
Date: Mon, 17 Jul 2023 12:57:55 +0800
Subject: [PATCH] 0716: END0516 files for the SED model (sentence bert, elmo,
 ...)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 END0516/Code/main.py | 268 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 268 insertions(+)
 create mode 100644 END0516/Code/main.py

diff --git a/END0516/Code/main.py b/END0516/Code/main.py
new file mode 100644
index 0000000..a606871
--- /dev/null
+++ b/END0516/Code/main.py
@@ -0,0 +1,268 @@
+import pandas as pd
+
+import DocSimElmoPlus0516
+import DotProSim
+import PatSimBert0516
+import handler
+
+
+def calculate_similarity(doctors):
+    # Count how many times each doctor name appears
+    name_counts = {}
+    for doctor in doctors:
+        name = doctor['医生名称']
+        if name in name_counts:
+            name_counts[name] += 1
+        else:
+            name_counts[name] = 1
+
+    # Select doctor names that appear more than once
+    selected_names = [name for name, count in name_counts.items() if count > 1]
+
+    # Compute the weighted similarity for each selected doctor
+    similarity_scores = {}
+    for doctor in doctors:
+        name = doctor['医生名称']
+        score = doctor['相似度得分']
+        if name in selected_names:
+            if doctor['医生集'] == 'A':
+                score *= 0.6
+            elif doctor['医生集'] == 'B':
+                score *= 0.3
+            elif doctor['医生集'] == 'C':
+                score *= 0.1
+            if name in similarity_scores:
+                similarity_scores[name] += score
+            else:
+                similarity_scores[name] = score
+
+    # Sort by similarity in descending order
+    sorted_results = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)
+
+    # Keep only the top ten doctors
+    return sorted_results[:10]
+    # return sorted_results
+
+
+def calculate_precision(candidates, test_data):
+    total_tests = len(test_data)
+    matched_tests = 0
+
+    for i in range(total_tests):
+        test = test_data[i]
+        for doctor in test:
+            for candidate in candidates[i]:
+                if doctor == candidate[0]:
+                    matched_tests += 1
+                    break
+
+    precision = matched_tests / total_tests if total_tests > 0 else 0
+    return precision
+
+
+def calculate_average_precision(candidate_set, test_set):
+    total_precision = 0.0
+    relevant_count = 0
+
+    for i in range(len(test_set)):
+        query = test_set[i]
+        candidates = candidate_set[i]
+        position = 0
+
+        for j in range(len(candidates)):
+            position += 1
+            if query in candidates[j]:
+                precision = 1.0 / position
+                total_precision += precision
+                relevant_count += 1
+                break
+
+    if relevant_count > 0:
+        return total_precision / relevant_count
+    else:
+        return 0.0
+
+
+def calculate_coverage(candidate_sets):
+    total_coverage = 0.0
+    candidate_count = 0
+
+    for candidates in candidate_sets:
+        distinct_doctors = set(candidate['医生名称'] for candidate in candidates)
+        coverage = len(distinct_doctors) / 229
+        total_coverage += coverage
+        candidate_count += 1
+
+    if candidate_count > 0:
+        return total_coverage / candidate_count
+    else:
+        return 0.0
+
+
+def main():
+    # Test set (sample queries)
+    # queries = [
+    #     "不拉不尿,吃啥吐啥,喝啥也吐。",
+    #     "晚餐不消化,腹胀还放屁"
+    # ]
+    # Read the Excel file
+    df = pd.read_excel('../data/导医测试数据.xlsx', sheet_name='Sheet1')
+
+    # Use the "ask" column as the list of queries
+    queries = df['ask'].tolist()
+
+    results_s = []  # final recommendation results for every query
+    results1_s = []  # raw (pre-ranking) candidate results for every query
+
+    for query in queries:
+        '''
+        Similar questions
+        '''
+        print(f"Consultation question: {query}")
+        # output_values: top 50 similar consultations
+        output_values = PatSimBert0516.process_data(query)
+
+        # Set of distinct sentence texts
+        unique_sentences = set()
+        # Iterate over the output values
+        for output_struct in output_values:
+            # Skip entries identical to the query
+            if output_struct.sentence == query:
+                continue
+            # Add the distinct sentence text to the set
+            unique_sentences.add(output_struct.sentence)
+            # Stop once five distinct sentence texts have been found
+            if len(unique_sentences) == 5:
+                break
+
+        # Keep the entries whose sentence is one of the selected sentences
+        # matching_data: entries for the top 5 distinct similar consultations
+        matching_data = []
+        # Iterate over the output values again
+        for output_struct in output_values:
+            # Check whether this entry's sentence was selected
+            if output_struct.sentence in unique_sentences:
+                # Collect the matching entry
+                matching_data.append(output_struct)
+
+        results = []  # recommendation results for this single query
+
+        print("======================== Doctor set A =============================")
+        for dataA in matching_data:
+            print(f"Doctor set A: name: {dataA.name}, Score: {dataA.score:.4f}")
+            result = {
+                '医生集': 'A',
+                '医生名称': dataA.name,
+                '相似度得分': dataA.score
+            }
+            results.append(result)
+
+        # Find the idx of the adopted answers
+        min_indices = handler.find_minimum_idx(matching_data)
+        print(f"Adopted answer idx: {min_indices}")
+
+        '''
+        Similar answers
+        '''
+        target_indices = [x - 1 for x in min_indices]
+        # Date tag used for the output path
+        output_path = '0516'
+        cosine_similarity = DocSimElmoPlus0516.calculate_cosine_similarity(target_indices, output_path)
+
+        # Collect the output (doctor set B candidates)
+        results.append(cosine_similarity)
+
+        '''
+        Similar doctors
+        '''
+        doctor_idxs = []
+        for target_indice in min_indices:
+            doctor_idx = handler.find_doctor_id_by_idx(target_indice)
+            doctor_idxs.append(doctor_idx)
+        similar_doctors = DotProSim.find_top_similar_doctors(doctor_idxs)
+        # Print the results
+        print("======================== Doctor set C =============================")
+        for doctor in similar_doctors:
+            idx = doctor['idx']
+            # print(f"Similar doctors for doctor {idx}:")
+            for similar_idx, similarity in doctor['similarities']:
+                name_by_idx = handler.find_doctor_name_by_idx(similar_idx)
+                name_doctor = name_by_idx[0]
+                # print(f"Doctor set C: name: {name_by_idx}, Score: {similarity:.4f}")
+                result = {
+                    '医生集': 'C',
+                    '医生名称': name_doctor,
+                    '相似度得分': similarity
+                }
+                results.append(result)
+            print()
+
+        # results_1: flattened, unprocessed candidate doctor set
+        def flatten_list(lst):
+            flattened = []
+            for item in lst:
+                if isinstance(item, list):
+                    flattened.extend(flatten_list(item))
+                else:
+                    flattened.append(item)
+            return flattened
+
+        results_1 = flatten_list(results)
+        results1_s.append(results_1)
+
+        print("---------- automation 0624 ----------")
+        # Print the candidate array
+        for result in results_1:
+            print(f"Doctor set: {result['医生集']}, Doctor name: {result['医生名称']}, Similarity score: {result['相似度得分']}")
+
+        '''
+        1. Process the candidate doctor set
+           Doctors whose name appears more than once enter the calculation.
+           Weights: A 0.6, B 0.3, C 0.1 -> candidate doctors ranked as (doctor, similarity)
+        2. Compute the metrics
+        2.1 Precision
+            Check whether the test doctor is in the candidate set (1 if present, 0 if not)
+            to obtain the precision over the test set.
+        2.2 Average precision
+            For each doctor actually present, take 1/position of its first hit and
+            average the precisions (sum * 1/n).
+        2.3 Coverage
+            Number m of distinct doctors appearing in sets A/B/C, divided by 229.
+        '''
+        # results_2: final selected doctors
+        results_2 = calculate_similarity(results_1)
+        for result in results_2:
+            name = result[0]
+            similarity = result[1]
+            print(f"Doctor name: {name}, Similarity: {similarity}")
+
+        print("-----end0625------")
+        results_s.append(results_2)
+        print("-----end0625------")
+
+    with open('results_s.txt', 'w', encoding='utf-8') as file:
+        for result in results_s:
+            result_str = ' '.join(str(item) for item in result)
+            file.write(result_str + '\n')
+
+    # Test set ground truth
+    excel_file = '../data/导医测试数据.xlsx'
+    # Sheet containing the test data
+    sheet_name = 'Sheet1'
+    df = pd.read_excel(excel_file, sheet_name=sheet_name)
+    test_data = df['doctor'].tolist()
+    questions = df['ask'].tolist()
+    test_data1 = [[doctor] for doctor in test_data]
+
+    # Precision
+    precision = calculate_precision(results_s, test_data1)
+    print(f"Precision on the test set: {precision:.2%}")
+
+    # Average precision
+    average_precision = calculate_average_precision(results_s, test_data)
+    print(f"Average precision: {average_precision:.2%}")
+
+    # Coverage
+    coverage = calculate_coverage(results1_s)
+    print("Doctor coverage: {:.2%}".format(coverage))
+
+
+if __name__ == "__main__":
+    main()
--
2.22.0
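
Note (not part of the patch): a minimal sketch of how the weighted candidate merge and the three metrics defined in main.py behave on toy data. It assumes the four helper functions from END0516/Code/main.py are importable (or pasted into the same file); the doctor names, scores, and ground truth below are invented purely for illustration.

from main import (calculate_similarity, calculate_precision,
                  calculate_average_precision, calculate_coverage)

# Hypothetical candidates for a single query. Both doctors appear more than
# once, so both pass the count > 1 filter; set A entries are weighted by 0.6
# and set C entries by 0.1, matching calculate_similarity.
results_1 = [
    {'医生集': 'A', '医生名称': 'Dr. Wang', '相似度得分': 0.90},
    {'医生集': 'A', '医生名称': 'Dr. Li', '相似度得分': 0.80},
    {'医生集': 'C', '医生名称': 'Dr. Wang', '相似度得分': 0.70},
    {'医生集': 'C', '医生名称': 'Dr. Li', '相似度得分': 0.60},
]
results_2 = calculate_similarity(results_1)
# Dr. Wang: 0.90*0.6 + 0.70*0.1 ~= 0.61; Dr. Li: 0.80*0.6 + 0.60*0.1 ~= 0.54
print(results_2)

results_s = [results_2]   # one ranked candidate list per test query
test_data = ['Dr. Li']    # ground-truth doctor for the single query
print(calculate_precision(results_s, [[d] for d in test_data]))  # 1.0
print(calculate_average_precision(results_s, test_data))         # first hit at rank 2 -> 0.5
print(calculate_coverage([results_1]))                           # 2 distinct doctors / 229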