Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
文
文本导医模型
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
李矗松
文本导医模型
Commits
176a2f4c
Commit
176a2f4c
authored
Jul 17, 2023
by
lichusong
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
0716-END0516文件放SED模型(sentence bert elmo ...)
parent
e03f09ee
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
268 additions
and
0 deletions
+268
-0
main.py
END0516/Code/main.py
+268
-0
No files found.
END0516/Code/main.py
0 → 100644
View file @
176a2f4c
from collections import Counter

import pandas as pd

import DocSimElmoPlus0516
import DotProSim
import PatSimBert0516
import handler
# Weight applied to a score according to the doctor set ('医生集') it came from.
_DEFAULT_SET_WEIGHTS = {'A': 0.6, 'B': 0.3, 'C': 0.1}


def calculate_similarity(doctors, top_n=10, min_count=2, set_weights=None):
    """Fuse per-set doctor scores into a single ranked candidate list.

    Each entry of *doctors* is a dict with the keys '医生集' (doctor set),
    '医生名称' (doctor name) and '相似度得分' (similarity score).  Only doctors
    whose name occurs at least ``min_count`` times in *doctors* are kept.
    For those, every score is multiplied by the weight of its doctor set
    and the weighted scores are summed per name.

    Args:
        doctors: Iterable of candidate dicts as described above.  It is
            traversed twice, so pass a list rather than a one-shot iterator.
        top_n: Maximum number of results to return; ``None`` returns all.
        min_count: Minimum number of occurrences a name needs to be kept.
            The default of 2 matches the original ``count > 1`` test.
        set_weights: Optional mapping from doctor set to weight.  Defaults
            to A=0.6, B=0.3, C=0.1.  Sets missing from the mapping leave
            the score unweighted.

    Returns:
        A list of ``(name, total_score)`` tuples sorted by score in
        descending order (ties keep first-seen order), cut to ``top_n``.
    """
    weights = _DEFAULT_SET_WEIGHTS if set_weights is None else set_weights

    # Count how often every doctor name occurs across all sets.
    name_counts = Counter(doctor['医生名称'] for doctor in doctors)

    # A set gives O(1) membership tests inside the scoring loop below.
    selected_names = {
        name for name, count in name_counts.items() if count >= min_count
    }

    # Accumulate the weighted score of every selected doctor.
    similarity_scores = {}
    for doctor in doctors:
        name = doctor['医生名称']
        score = doctor['相似度得分']
        if name not in selected_names:
            continue
        score = score * weights.get(doctor['医生集'], 1)
        similarity_scores[name] = similarity_scores.get(name, 0) + score

    # Highest combined score first.
    sorted_results = sorted(
        similarity_scores.items(), key=lambda item: item[1], reverse=True
    )
    if top_n is None:
        return sorted_results
    return sorted_results[:top_n]
def calculate_precision(candidates, test_data):
    """Return the fraction of test cases whose expected doctor was recommended.

    A test case counts as matched (once, however many of its doctors match)
    when at least one of its expected doctors equals the name -- element 0 --
    of any candidate recommended for that test case.

    Args:
        candidates: Per test case, a sequence of candidates whose first
            element is the doctor name, e.g. ``(name, score)`` tuples.
        test_data: Per test case, an iterable of expected doctor names.

    Returns:
        ``matched / len(test_data)``, or ``0`` when *test_data* is empty.
        Test cases without a corresponding entry in *candidates* count as
        misses instead of raising ``IndexError``.
    """
    total_tests = len(test_data)
    if total_tests == 0:
        return 0

    matched_tests = 0
    # zip() stops at the shorter input, so missing candidate lists are misses.
    for expected_doctors, recommended in zip(test_data, candidates):
        hit = any(
            doctor == candidate[0]
            for doctor in expected_doctors
            for candidate in recommended
        )
        if hit:
            matched_tests += 1

    return matched_tests / total_tests
def calculate_average_precision(candidate_set, test_set):
    """Return the mean reciprocal rank of the expected doctor over all hits.

    For every query the ranked candidates are scanned in order; the first
    candidate whose name -- element 0 -- equals the expected doctor
    contributes ``1 / position`` (positions start at 1).

    Args:
        candidate_set: Per query, a ranked sequence of candidates whose
            first element is the doctor name, e.g. ``(name, score)`` tuples.
        test_set: Per query, the expected doctor name.

    Returns:
        The sum of the reciprocal ranks divided by the number of queries
        that had a hit, or ``0.0`` when no query had a hit.  Queries without
        a hit, or without an entry in *candidate_set*, do not enter the
        average.
    """
    total_precision = 0.0
    relevant_count = 0

    # zip() stops at the shorter input instead of raising IndexError.
    for query, candidates in zip(test_set, candidate_set):
        for position, candidate in enumerate(candidates, start=1):
            # Compare against the name only, so that a query can never
            # accidentally match the numeric score of a candidate.
            if candidate[0] == query:
                total_precision += 1.0 / position
                relevant_count += 1
                break

    if relevant_count == 0:
        return 0.0
    return total_precision / relevant_count
def calculate_coverage(candidate_sets, total_doctors=229):
    """Return the average share of the doctor roster covered per query.

    For every candidate list the number of distinct doctor names
    ('医生名称') is divided by ``total_doctors``; the per-query ratios are
    then averaged.

    Args:
        candidate_sets: Iterable of candidate lists; every candidate is a
            dict with a '医生名称' key.
        total_doctors: Size of the full doctor roster used as denominator.
            Defaults to 229, the value the original code hard-coded.

    Returns:
        The mean coverage ratio, or ``0.0`` when *candidate_sets* is empty.

    Raises:
        ValueError: If ``total_doctors`` is not positive.
    """
    if total_doctors <= 0:
        raise ValueError("total_doctors must be positive")

    coverages = [
        len({candidate['医生名称'] for candidate in candidates}) / total_doctors
        for candidates in candidate_sets
    ]
    if not coverages:
        return 0.0
    return sum(coverages) / len(coverages)
def _flatten_list(lst):
    """Return a flat list with every nested list in *lst* expanded in order."""
    flattened = []
    for item in lst:
        if isinstance(item, list):
            flattened.extend(_flatten_list(item))
        else:
            flattened.append(item)
    return flattened


def main():
    """Run the recommendation pipeline over the test sheet and print metrics.

    For every query in the 'ask' column of the test workbook the function
    gathers candidate doctors from three sources (sets A, B and C), fuses
    them with ``calculate_similarity`` and stores the ranking.  Afterwards
    the rankings are written to ``results_s.txt`` and precision, average
    precision and coverage are printed, using the 'doctor' column as the
    expected answer.

    NOTE(review): the statement nesting was reconstructed from a source
    whose indentation had been lost -- confirm the placement of the
    separator ``print`` calls against the repository version.
    """
    # Load the test workbook once; it holds both the queries ('ask') and
    # the expected doctors ('doctor').
    df = pd.read_excel('../data/导医测试数据.xlsx', sheet_name='Sheet1')
    queries = df['ask'].tolist()

    results_s = []   # fused, ranked recommendations, one list per query
    results1_s = []  # raw (unfused) candidates, one list per query

    for query in queries:
        # ---- Similar questions (doctor set A) ----
        print(f"咨询问题:{query}")

        # Similar consultations for the query; every item exposes
        # .sentence, .name and .score (the original comment says the top 50).
        output_values = PatSimBert0516.process_data(query)

        # Collect the first five distinct sentences that differ from the query.
        unique_sentences = set()
        for output_struct in output_values:
            if output_struct.sentence == query:
                continue
            unique_sentences.add(output_struct.sentence)
            if len(unique_sentences) == 5:
                break

        # Keep every record whose sentence is one of those five.
        matching_data = [
            output_struct
            for output_struct in output_values
            if output_struct.sentence in unique_sentences
        ]

        results = []  # candidates of all three sets for this query
        print("========================医生集A=============================")
        for dataA in matching_data:
            print(f"医生集A: name: {dataA.name}, Score: {dataA.score:.4f}")
            results.append({
                '医生集': 'A',
                '医生名称': dataA.name,
                '相似度得分': dataA.score,
            })

        # Indices of the adopted answers among the matching records.
        min_indices = handler.find_minimum_idx(matching_data)
        print(f"已采纳的回答idx: {min_indices}")

        # ---- Similar answers (doctor set B) ----
        # Indices are shifted down by one before the lookup
        # (presumably 1-based -> 0-based; verify against DocSimElmoPlus0516).
        target_indices = [x - 1 for x in min_indices]
        output_path = '0516'  # date tag handed to the similarity helper
        cosine_similarity = DocSimElmoPlus0516.calculate_cosine_similarity(
            target_indices, output_path
        )
        # Appended as a single item; _flatten_list() below expands it if it
        # is a list of candidate dicts.
        results.append(cosine_similarity)

        # ---- Similar doctors (doctor set C) ----
        doctor_idxs = [
            handler.find_doctor_id_by_idx(target_index)
            for target_index in min_indices
        ]
        similar_doctors = DotProSim.find_top_similar_doctors(doctor_idxs)

        print("================3333333==============")
        for doctor in similar_doctors:
            for similar_idx, similarity in doctor['similarities']:
                name_by_idx = handler.find_doctor_name_by_idx(similar_idx)
                results.append({
                    '医生集': 'C',
                    '医生名称': name_by_idx[0],
                    '相似度得分': similarity,
                })
            print()

        # results_1: raw candidate set, flattened to one list of dicts.
        results_1 = _flatten_list(results)
        results1_s.append(results_1)

        print("----------自动化0624----------")
        for result in results_1:
            print(f"医生集: {result['医生集']}, 医生名称: {result['医生名称']}, 相似度得分: {result['相似度得分']}")

        # 1. Fuse the candidate set: doctors that occur repeatedly enter the
        #    calculation, weighted A 0.6 / B 0.3 / C 0.1, giving a ranking
        #    of (doctor, similarity).
        # 2. Metrics computed after the loop:
        #    2.1 precision: 1 if the expected doctor is among the
        #        candidates, else 0, averaged over the test set;
        #    2.2 average precision: 1 / position of the expected doctor,
        #        averaged;
        #    2.3 coverage: distinct doctors in sets A, B, C divided by 229.
        # results_2: fused and ranked recommendation for this query.
        results_2 = calculate_similarity(results_1)
        for name, similarity in results_2:
            print(f"医生名称: {name}, 相似度: {similarity}")
        print("-----end0625------")
        results_s.append(results_2)

    print("-----end0625------")

    # Persist one line per query with its ranked (name, score) tuples.
    with open('results_s.txt', 'w', encoding='utf-8') as file:
        for result in results_s:
            result_str = ' '.join(str(item) for item in result)
            file.write(result_str + '\n')

    # Expected doctors come from the workbook that was loaded above; the
    # original re-read the same sheet a second time.
    test_data = df['doctor'].tolist()
    test_data1 = [[doctor] for doctor in test_data]

    # Precision
    precision = calculate_precision(results_s, test_data1)
    print(f"测试集的准确率: {precision:.2%}")

    # Average precision
    average_precision = calculate_average_precision(results_s, test_data)
    print(f"平均精准度: {average_precision:.2%}")

    # Coverage
    coverage = calculate_coverage(results1_s)
    print("医生覆盖率: {:.2%}".format(coverage))
# Script entry point: run the evaluation only when this file is executed directly.
if
__name__
==
"__main__"
:
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment