Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
L
LLM_extraction
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
李盟
LLM_extraction
Commits
f9515dfa
You need to sign in or sign up before continuing.
Commit
f9515dfa
authored
Mar 04, 2025
by
李盟
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Upload New File
parent
4e5d81bd
Pipeline
#504
canceled with stages
Changes
1
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
203 additions
and
0 deletions
+203
-0
q25_14_ins_int8.py
q25_14_ins_int8.py
+203
-0
No files found.
q25_14_ins_int8.py
0 → 100644
View file @
f9515dfa
import os

# Runtime configuration is exported BEFORE the heavy third-party imports.
# The Hugging Face offline switches are evaluated when those libraries are
# first imported (vllm pulls them in), so assigning them afterwards is too
# late to have any effect.
os.environ["NCCL_P2P_DISABLE"] = "1"  # disable NCCL peer-to-peer GPU transfers
os.environ["TRANSFORMERS_OFFLINE"] = "1"  # never reach out to the HF hub
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import json  # noqa: E402
import re  # noqa: E402

import torch  # noqa: E402
import torch.distributed as dist  # noqa: E402,F401
from tqdm import tqdm  # noqa: E402,F401
from vllm import LLM, SamplingParams  # noqa: E402

# Release any cached CUDA blocks before the engine is created.
# NOTE(review): nothing in this script has allocated GPU memory yet at this
# point, so this call is presumably a no-op — kept for parity with the
# original behaviour.
torch.cuda.empty_cache()
# --- Locations ------------------------------------------------------------
# Input text file with the medical records.
texts_path = "/home/limeng/NLP/LLM/data/jcza/record_text2.txt"
# Local directory holding the quantised model weights.
model_path = "/home/limeng/NLP/LLM/model/Qwen2.5-14B-Instruct-GPTQ-Int8"
# Directory that receives the per-record result files.
output_path = "/home/limeng/NLP/LLM/code/0220/result"

# --- Inference engine -----------------------------------------------------
_engine_options = {
    "model": model_path,
    "dtype": "half",
    "gpu_memory_utilization": 0.9,
}
llm = LLM(**_engine_options)

# --- Decoding settings ----------------------------------------------------
# Low temperature / top_p keep the sampling narrow; max_tokens caps the
# length of each generated answer.
_sampling_options = {
    "temperature": 0.3,
    "top_p": 0.3,
    "max_tokens": 1500,
}
sampling_params = SamplingParams(**_sampling_options)
# Wrappers the model has been observed to put around its JSON answer, tried
# in this order; group(1) of the first pattern that matches is parsed.
_JSON_PATTERNS = tuple(
    re.compile(pattern, re.DOTALL)
    for pattern in (
        r'```json\n(.*?)\n```',
        r'#### JSON 提取结果\n(.*?)\n#### JSON 提取结果',
        r'#### JSON 输出\n(.*?)\n#### JSON 输出',
        r'#### JSON 提取结果\n(.*?) 根据提',
    )
)

# Sentinel meaning "no JSON could be located in the model output".
_NO_JSON = object()


def _build_prompt(medical_record):
    """Return the extraction prompt for one medical record.

    The prompt contains the extraction rules, the record itself and an
    example JSON that defines the expected output schema.  The example is
    kept syntactically valid JSON (no missing or trailing commas): the
    prompt asks the model to mirror its format exactly, and the answer is
    later parsed with ``json.loads``.
    """
    return f"""
你是一个专业的医疗信息抽取助手。请从以下病历数据中严格按照#### 示例 JSON 提取字段信息,并确保:
1. **字段必须完整**,与示例 JSON 结构完全一致,不可缺失/新增/改动任何字段
2. **重点字段精确**:
- 手术史需抽提所有手术名称+时间(格式:"手术名称": "时间")
- 化疗方案需按"方案名": {{时间+具体用药列表}} 格式提取
- 放疗方案需按"方案描述": {{时间+次数+剂量}} 格式提取
3. **严格空值处理**:
- 字符串字段填"无"
- 列表/字典字段填空列表[]/空字典{{}}
- 嵌套结构需保持完整(如肿瘤患病史必须含"肿瘤类型"/"肿瘤结局"字段)
4. **严格JSON格式**:
- 保持与示例完全相同的缩进/标点格式
- 确保所有括号闭合,逗号正确
- 生成第一个完整JSON后立即停止,禁止解释性文字
#### 病历数据
{medical_record}
#### 示例 JSON
{{
"消瘦": "有",
"呕吐": "有",
"恶心": "有",
"腹部肿块": "有",
"腹胀": "有",
"腹痛": "有",
"里急后重": "有",
"腹泻": "有",
"大便形状改变": "有",
"排便困难": "有",
"黑便": "有",
"便血": "有",
"大便习惯和性状改变": "有",
"肠梗阻": "有",
"肠穿孔": "有",
"手术史": {{
"直肠癌经腹前切除+末端回肠造口术": "2017-01-11日",
"冠脉支架置入术":" 2019年"
}},
"肝转移": "有",
"肺转移": "有",
"腹膜转移": "有",
"骨转移": "有",
"远处转移": "有",
"锁骨上转移": "有",
"腹股沟转移": "有",
"腹膜后淋巴结转移": "有",
"其他远处淋巴转移": "无",
"化疗方案": {{
"2011-1-12行XELOX方案":
{{"时间": "2011-1-12",
"具体用药": [
"卡培他宾1.5g 2/日d1-14",
"奥沙利铂200mg d1"
]
}},
"2022-7-17、8-9行FOLFOX+西妥昔单抗方案":
{{ "时间": "2022-7-17、8-9",
"具体用药": [
"奥沙利铂140mg 静滴D1",
"亚叶酸钙0.6g 静滴D1",
"5-FU 0.4g 静滴D1",
"5-FU 4.0g 化疗泵入44h",
"西妥昔单抗800mg 静滴D1"
]}}
}},
"放疗方案":
{{
"盆腔复发灶": {{
"时间": "2023-04-20至2023-05-30",
"次数": "25",
"单次剂量":"2Gy",
"总剂量":"50Gy"
}},
"照射方法为适形调强放疗IMRT,分割方法为常规分割,疗效评估为PR": {{
"时间": "2021年11月15日开始,2021年12月24日结束",
"次数": "25",
"单次剂量":"2Gy",
"总剂量":""
}}
}},
"仍需治疗的其他疾病情况": ["高血压", "糖尿病"],
"入院前仍在服用的治疗药物": ["硝苯地平缓释片", "达格列净", "格列齐特"],
"高血压史": "有",
"伤寒史": "有",
"结核史": "有",
"病毒性肝炎史": "有",
"糖尿病史": "有",
"冠心病史": "有",
"冠脉支架放置": "有",
"脑卒中史": "有",
"其他非肿瘤疾病": [
"慢性乙型病毒性肝炎"
],
"肿瘤患病史": {{
"肿瘤类型": "左肺腺癌",
"肿瘤结局": "治愈"
}},
"吸烟史": {{
"吸烟年数": "25年",
"日吸烟量": "10支/天",
"是否戒烟": "已戒烟15年"
}},
"饮酒史": "有",
"婚育史": {{
"是否已婚": "已婚",
"是否已育": "已育",
"已育数量": "1女"
}},
"结直肠癌家族史": {{
"遗传性结直肠癌类型": "无",
"亲属类型": "弟弟",
"其他遗传性肿瘤": "结肠癌"
}},
"体温": "36.0℃",
"呼吸": "18次/分",
"心率": "80次/分",
"血压": "120/80mmHg",
"BMI": "19.6",
"直肠指诊": {{
"直肠指诊姿势": "膝胸位",
"直肠指诊是否触及肿块": "有",
"直肠指诊肿块下缘到肛缘距离": "5cm",
"直肠指诊肿块下缘到齿状线距离": "无",
"直肠指诊肿块活动度": "尚可",
"直肠指诊指套推出是否染血": "有"
}},
"贫血貌": "有",
"巩膜黄染": "有",
"锁骨上淋巴结肿大": "有",
"腹壁静脉曲张": "有",
"肠形": "有",
"腹部压痛": "有"
}}
"""


def _extract_json(model_output):
    """Locate and parse the JSON answer inside ``model_output``.

    The known wrapper patterns are tried in order; the first one that
    matches is parsed with ``json.loads``.  If none matches, the first JSON
    value starting at the first ``{`` is decoded, which covers the case the
    prompt actually asks for: a bare JSON object with no surrounding text.

    Returns:
        The parsed JSON value, or ``_NO_JSON`` when nothing parseable was
        found.

    Raises:
        json.JSONDecodeError: a wrapper pattern matched but its content is
            not valid JSON.
    """
    for pattern in _JSON_PATTERNS:
        match = pattern.search(model_output)
        if match:
            return json.loads(match.group(1))

    brace_at = model_output.find("{")
    if brace_at != -1:
        try:
            # raw_decode tolerates trailing text after the first JSON value.
            parsed, _end = json.JSONDecoder().raw_decode(model_output, brace_at)
        except json.JSONDecodeError:
            pass
        else:
            return parsed
    return _NO_JSON


def _write_json(path, payload):
    """Write ``payload`` to ``path`` as indented UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as out_file:
        json.dump(payload, out_file, ensure_ascii=False, indent=4)


# Make sure the result directory exists before the first write.
os.makedirs(output_path, exist_ok=True)

with open(texts_path, "r", encoding="utf-8") as records_file:
    for record_index, raw_line in enumerate(records_file):
        # One record per line; strip surrounding whitespace.
        medical_record = raw_line.strip()
        if not medical_record:
            # Nothing to extract from a blank line; the index still advances
            # so output file numbers keep matching input line numbers.
            continue

        model_output = None
        try:
            # Ask the model for the extraction.
            # NOTE(review): the prompt is passed as raw text; the model path
            # names an Instruct checkpoint, which usually expects its chat
            # template — confirm whether that is intended.
            outputs = llm.generate(_build_prompt(medical_record), sampling_params)
            model_output = outputs[0].outputs[0].text
            print(model_output)

            result_json = _extract_json(model_output)
            if result_json is _NO_JSON:
                # No JSON found: keep the raw answer for later inspection.
                result_json = {
                    "error": "Invalid model output format",
                    "original_text": medical_record,
                    "model_output": model_output,
                }

            # Persist the result for this record.
            _write_json(
                os.path.join(output_path, f"long_txt{record_index}.json"),
                {"text_record": medical_record, "extracted_info": result_json},
            )
        except Exception as exc:
            # Per-record boundary: log the failure, save what we have and
            # carry on with the next record.
            print(f"Error processing record {record_index}: {exc}")
            failure = {"text_record": medical_record, "error": str(exc)}
            if model_output is not None:
                # Keep the raw answer so a parse failure can be diagnosed
                # without re-running the model.
                failure["model_output"] = model_output
            _write_json(
                os.path.join(output_path, f"long_txt{record_index}_error.json"),
                failure,
            )
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment