Commit 176a2f4c authored by lichusong's avatar lichusong

0716-END: put the 0516 files into the SED model (Sentence-BERT, ELMo, ...)

parent e03f09ee
import pandas as pd
import DocSimElmoPlus0516
import DotProSim
import PatSimBert0516
import handler
def calculate_similarity(doctors):
    # Count how many times each doctor name appears across the candidate records
    name_counts = {}
    for doctor in doctors:
        name = doctor['医生名称']
        if name in name_counts:
            name_counts[name] += 1
        else:
            name_counts[name] = 1
    # Keep only doctor names that appear more than once
    selected_names = [name for name, count in name_counts.items() if count > 1]
    # Accumulate a weighted similarity score per selected doctor
    similarity_scores = {}
    for doctor in doctors:
        name = doctor['医生名称']
        score = doctor['相似度得分']
        if name in selected_names:
            if doctor['医生集'] == 'A':
                score *= 0.6
            elif doctor['医生集'] == 'B':
                score *= 0.3
            elif doctor['医生集'] == 'C':
                score *= 0.1
            if name in similarity_scores:
                similarity_scores[name] += score
            else:
                similarity_scores[name] = score
    # Sort by accumulated similarity in descending order
    sorted_results = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)
    # Keep only the top ten doctors
    return sorted_results[:10]
    # return sorted_results
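

# Illustrative sketch (not part of the original commit): shows how the A/B/C
# weights (0.6 / 0.3 / 0.1) and the "appears more than once" filter combine.
# The doctor names below are made-up placeholders, not project data.
def _demo_calculate_similarity():
    toy_candidates = [
        {'医生集': 'A', '医生名称': 'doc_1', '相似度得分': 0.90},
        {'医生集': 'C', '医生名称': 'doc_1', '相似度得分': 0.80},
        {'医生集': 'B', '医生名称': 'doc_2', '相似度得分': 0.70},  # appears only once, filtered out
    ]
    # doc_1 appears twice, so it is kept: 0.9 * 0.6 + 0.8 * 0.1 ≈ 0.62
    print(calculate_similarity(toy_candidates))  # -> [('doc_1', 0.62...)]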
def calculate_precision(candidates, test_data):
    # Fraction of test queries whose labeled doctor appears in that query's candidate list
    total_tests = len(test_data)
    matched_tests = 0
    for i in range(total_tests):
        test = test_data[i]
        for doctor in test:
            for candidate in candidates[i]:
                if doctor == candidate[0]:
                    matched_tests += 1
                    break
    precision = matched_tests / total_tests if total_tests > 0 else 0
    return precision
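

# Illustrative sketch (not part of the original commit): calculate_precision
# expects, per query, a ranked list of (name, score) tuples (the shape returned
# by calculate_similarity) and a test list that wraps each ground-truth doctor
# in its own list. The names are made-up placeholders.
def _demo_calculate_precision():
    candidates = [
        [('doc_1', 0.62), ('doc_2', 0.30)],  # query 0: ground truth present
        [('doc_3', 0.55)],                   # query 1: ground truth missing
    ]
    test_data = [['doc_1'], ['doc_9']]
    print(calculate_precision(candidates, test_data))  # -> 0.5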
def calculate_average_precision(candidate_set, test_set):
    # Mean reciprocal rank: for each query take 1 / (rank of the labeled doctor
    # in its candidate list), averaged over the queries where the doctor is found
    total_precision = 0.0
    relevant_count = 0
    for i in range(len(test_set)):
        query = test_set[i]
        candidates = candidate_set[i]
        position = 0
        for j in range(len(candidates)):
            position += 1
            if query in candidates[j]:
                precision = 1.0 / position
                total_precision += precision
                relevant_count += 1
                break
    if relevant_count > 0:
        return total_precision / relevant_count
    else:
        return 0.0
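

# Illustrative sketch (not part of the original commit): the labeled doctor for
# query 0 sits at rank 2 (reciprocal rank 1/2) and for query 1 at rank 1 (1/1);
# only found doctors are averaged, so the result is (0.5 + 1.0) / 2 = 0.75.
# Names are made-up placeholders.
def _demo_calculate_average_precision():
    candidate_set = [
        [('doc_2', 0.70), ('doc_1', 0.62)],
        [('doc_3', 0.55), ('doc_4', 0.40)],
    ]
    test_set = ['doc_1', 'doc_3']
    print(calculate_average_precision(candidate_set, test_set))  # -> 0.75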
def calculate_coverage(candidate_sets):
    # Average fraction of the 229 doctors in the pool that appear in each
    # per-query candidate list (sets A/B/C combined)
    total_coverage = 0.0
    candidate_count = 0
    for candidates in candidate_sets:
        distinct_doctors = set(candidate['医生名称'] for candidate in candidates)
        coverage = len(distinct_doctors) / 229
        total_coverage += coverage
        candidate_count += 1
    if candidate_count > 0:
        return total_coverage / candidate_count
    else:
        return 0.0
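

# Illustrative sketch (not part of the original commit): each candidate list
# contributes (number of distinct doctor names) / 229, and these fractions are
# averaged across queries. With two distinct names per query the coverage is
# 2 / 229 ≈ 0.0087. Names are made-up placeholders.
def _demo_calculate_coverage():
    candidate_sets = [
        [{'医生名称': 'doc_1'}, {'医生名称': 'doc_1'}, {'医生名称': 'doc_2'}],
        [{'医生名称': 'doc_3'}, {'医生名称': 'doc_4'}],
    ]
    print(calculate_coverage(candidate_sets))  # -> 2 / 229 ≈ 0.0087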
def main():
    # Test queries
    # queries = [
    #     "不拉不尿,吃啥吐啥,喝啥也吐。",
    #     "晚餐不消化,腹胀还放屁"
    # ]
    # Read the Excel test file
    df = pd.read_excel('../data/导医测试数据.xlsx', sheet_name='Sheet1')
    # Use the "ask" column as the list of queries
    queries = df['ask'].tolist()
    results_s = []   # ranked recommendation results for all queries
    results1_s = []  # raw candidate lists (before ranking) for all queries
    for query in queries:
        '''
        Similar questions
        '''
        print(f"咨询问题:{query}")
        # output_values: the top-50 similar consultations
        output_values = PatSimBert0516.process_data(query)
        # Set of distinct sentence texts
        unique_sentences = set()
        # Walk the output list and collect distinct sentences
        for output_struct in output_values:
            # Skip entries identical to the query itself
            if output_struct.sentence == query:
                continue
            # Add this sentence text to the set of distinct sentences
            unique_sentences.add(output_struct.sentence)
            # Stop once five distinct sentence texts have been found
            if len(unique_sentences) == 5:
                break
        # Keep the entries whose sentence matches one in unique_sentences
        # matching_data: the top-5 distinct similar consultations
        matching_data = []
        for output_struct in output_values:
            # Check whether this entry's sentence is one of the distinct sentences
            if output_struct.sentence in unique_sentences:
                # Add the matching entry to matching_data
                matching_data.append(output_struct)
        results = []  # candidate records for this single query
        print("========================医生集A=============================")
        for dataA in matching_data:
            print(f"医生集A: name: {dataA.name}, Score: {dataA.score:.4f}")
            result = {
                '医生集': 'A',
                '医生名称': dataA.name,
                '相似度得分': dataA.score
            }
            results.append(result)
        # Find the indices of the accepted answers
        min_indices = handler.find_minimum_idx(matching_data)
        print(f"已采纳的回答idx: {min_indices}")
        '''
        Similar answers
        '''
        target_indices = [x - 1 for x in min_indices]
        # Date tag used as the output path
        output_path = '0516'
        cosine_similarity = DocSimElmoPlus0516.calculate_cosine_similarity(target_indices, output_path)
        # Append the doctor-set B records returned by the answer-similarity model
        results.append(cosine_similarity)
        '''
        Similar doctors
        '''
        doctor_idxs = []
        for target_indice in min_indices:
            doctor_idx = handler.find_doctor_id_by_idx(target_indice)
            doctor_idxs.append(doctor_idx)
        similar_doctors = DotProSim.find_top_similar_doctors(doctor_idxs)
        # Print the results
        print("================3333333==============")
        for doctor in similar_doctors:
            idx = doctor['idx']
            # print(f"医生 {idx} 的相似医生:")
            for similar_idx, similarity in doctor['similarities']:
                name_by_idx = handler.find_doctor_name_by_idx(similar_idx)
                name_doctor = name_by_idx[0]
                # print(f"医生集C: name: {name_by_idx}, Score: {similarity:.4f}")
                result = {
                    '医生集': 'C',
                    '医生名称': name_doctor,
                    '相似度得分': similarity
                }
                results.append(result)
            print()
        # results_1: the flattened, unranked candidate doctor list
        def flatten_list(lst):
            flattened = []
            for item in lst:
                if isinstance(item, list):
                    flattened.extend(flatten_list(item))
                else:
                    flattened.append(item)
            return flattened
        results_1 = flatten_list(results)
        results1_s.append(results_1)
        print("----------自动化0624----------")
        # Print the flattened candidate records
        for result in results_1:
            print(f"医生集: {result['医生集']}, 医生名称: {result['医生名称']}, 相似度得分: {result['相似度得分']}")
        '''
        1. Process the candidate doctor set
           Only doctors that appear more than once in the list enter the calculation;
           weights of 0.6 for set A, 0.3 for set B and 0.1 for set C give the candidate
           doctors, ranked as (doctor, similarity) pairs.
        2. Compute the metrics
           2.1 Precision
               Check whether each test query's doctor is in the candidate set (1 if present,
               0 if not), giving the precision over the test set.
           2.2 Average precision
               For each doctor actually found, take 1 / its rank position; sum these
               reciprocal ranks and multiply by 1/n.
           2.3 Coverage
               m is the number of distinct doctors covered by sets A/B/C; coverage = m / 229.
        '''
        # results_2: the ranked candidate doctors selected for this query
        results_2 = calculate_similarity(results_1)
        for result in results_2:
            name = result[0]
            similarity = result[1]
            print(f"医生名称: {name}, 相似度: {similarity}")
        print("-----end0625------")
        results_s.append(results_2)
    print("-----end0625------")
    # Persist the per-query top-10 results
    with open('results_s.txt', 'w', encoding='utf-8') as file:
        for result in results_s:
            result_str = ' '.join(str(item) for item in result)
            file.write(result_str + '\n')
    # Test set
    excel_file = '../data/导医测试数据.xlsx'
    sheet_name = 'Sheet1'
    df = pd.read_excel(excel_file, sheet_name=sheet_name)
    test_data = df['doctor'].tolist()
    questions = df['ask'].tolist()
    test_data1 = [[doctor] for doctor in test_data]
    # Precision
    precision = calculate_precision(results_s, test_data1)
    print(f"测试集的准确率: {precision:.2%}")
    # Average precision
    average_precision = calculate_average_precision(results_s, test_data)
    print(f"平均精准度: {average_precision:.2%}")
    # Coverage
    coverage = calculate_coverage(results1_s)
    print("医生覆盖率: {:.2%}".format(coverage))


if __name__ == "__main__":
    main()