Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
LLM_extraction
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
李盟
LLM_extraction
Commits
82bfda58
Commit
82bfda58
authored
Mar 04, 2025
by
李盟
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Delete q25_14_ins_int8.py
parent
79e2f0e6
Pipeline
#507
canceled with stages
Changes
1
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
0 additions
and
203 deletions
+0
-203
q25_14_ins_int8.py
q25_14_ins_int8.py
+0
-203
No files found.
q25_14_ins_int8.py
deleted
100644 → 0
View file @
79e2f0e6
"""Extract structured fields from medical records with a local vLLM model.

Reads one medical record per line from ``texts_path``, asks the model to fill
in a fixed JSON template, parses the JSON out of the completion and writes one
result file per record into ``output_path``:

* ``long_txt{i}.json``        -- the record plus the extracted information
* ``long_txt{i}_error.json``  -- the record plus the error, when processing
  of that record raised
"""
import json
import os
import re

# The environment switches are assigned before the heavy third-party imports
# so that any library reading them while it is being imported sees them.
# NOTE(review): huggingface_hub/transformers appear to evaluate the offline
# flags at import time -- confirm against the installed versions.
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch  # noqa: E402
import torch.distributed as dist  # noqa: E402,F401
from tqdm import tqdm  # noqa: E402,F401
from vllm import LLM, SamplingParams  # noqa: E402

torch.cuda.empty_cache()

texts_path = "/home/limeng/NLP/LLM/data/jcza/record_text2.txt"
model_path = "/home/limeng/NLP/LLM/model/Qwen2.5-14B-Instruct-GPTQ-Int8"
output_path = "/home/limeng/NLP/LLM/code/0220/result"

# Known wrappers the model puts around its JSON answer, tried in this order.
# Compiled once here instead of on every record.
_JSON_PATTERNS = (
    re.compile(r'```json\n(.*?)\n```', re.DOTALL),
    re.compile(r'#### JSON 提取结果\n(.*?)\n#### JSON 提取结果', re.DOTALL),
    re.compile(r'#### JSON 输出\n(.*?)\n#### JSON 输出', re.DOTALL),
    re.compile(r'#### JSON 提取结果\n(.*?) 根据提', re.DOTALL),
)


def _build_prompt(medical_record):
    """Return the extraction prompt with *medical_record* embedded in it."""
    # NOTE(review): the example JSON below is not valid JSON (commas are
    # missing after some values, e.g. after the "时间" entries and after
    # "饮酒史"/"BMI", and there are trailing commas before closing braces),
    # although the prompt demands strictly valid JSON. It is left untouched
    # here because editing it changes what the model sees -- review separately.
    return f"""
你是一个专业的医疗信息抽取助手。请从以下病历数据中严格按照#### 示例 JSON 提取字段信息,并确保:
1. **字段必须完整**,与示例 JSON 结构完全一致,不可缺失/新增/改动任何字段
2. **重点字段精确**:
- 手术史需抽提所有手术名称+时间(格式:"手术名称": "时间")
- 化疗方案需按"方案名": {{时间+具体用药列表}} 格式提取
- 放疗方案需按"方案描述": {{时间+次数+剂量}} 格式提取
3. **严格空值处理**:
- 字符串字段填"无"
- 列表/字典字段填空列表[]/空字典{{}}
- 嵌套结构需保持完整(如肿瘤患病史必须含"肿瘤类型"/"肿瘤结局"字段)
4. **严格JSON格式**:
- 保持与示例完全相同的缩进/标点格式
- 确保所有括号闭合,逗号正确
- 生成第一个完整JSON后立即停止,禁止解释性文字
#### 病历数据
{medical_record}
#### 示例 JSON
{{
"消瘦": "有",
"呕吐": "有",
"恶心": "有",
"腹部肿块": "有",
"腹胀": "有",
"腹痛": "有",
"里急后重": "有",
"腹泻": "有",
"大便形状改变": "有",
"排便困难": "有",
"黑便": "有",
"便血": "有",
"大便习惯和性状改变": "有",
"肠梗阻": "有",
"肠穿孔": "有",
"手术史": {{
"直肠癌经腹前切除+末端回肠造口术": "2017-01-11日",
"冠脉支架置入术":" 2019年",
}},
"肝转移": "有",
"肺转移": "有",
"腹膜转移": "有",
"骨转移": "有",
"远处转移": "有",
"锁骨上转移": "有",
"腹股沟转移": "有",
"腹膜后淋巴结转移": "有",
"其他远处淋巴转移": "无",
"化疗方案": {{
"2011-1-12行XELOX方案":
{{"时间": "2011-1-12"
"具体用药": [
"卡培他宾1.5g 2/日d1-14",
"奥沙利铂200mg d1"
]
}},
"2022-7-17、8-9行FOLFOX+西妥昔单抗方案":
{{ "时间": "2022-7-17、8-9"
"具体用药": [
"奥沙利铂140mg 静滴D1",
"亚叶酸钙0.6g 静滴D1",
"5-FU 0.4g 静滴D1",
"5-FU 4.0g 化疗泵入44h",
"西妥昔单抗800mg 静滴D1"
]}}
}},
"放疗方案":
{{
"盆腔复发灶": {{
"时间": "2023-04-20至2023-05-30",
"次数": "25",
"单次剂量":"2Gy",
"总剂量":"50Gy"
}},
"照射方法为适形调强放疗IMRT,分割方法为常规分割,疗效评估为PR": {{
"时间": "2021年11月15日开始,2021年12月24日结束",
"次数": "25",
"单次剂量":"2Gy",
"总剂量":""
}},
}},
"仍需治疗的其他疾病情况": ["高血压", "糖尿病"],
"入院前仍在服用的治疗药物": ["硝苯地平缓释片", "达格列净", "格列齐特"],
"高血压史": "有",
"伤寒史": "有",
"结核史": "有",
"病毒性肝炎史": "有",
"糖尿病史": "有",
"冠心病史": "有",
"冠脉支架放置": "有",
"脑卒中史": "有",
"其他非肿瘤疾病": [
"慢性乙型病毒性肝炎",
],
"肿瘤患病史": {{
"肿瘤类型": "左肺腺癌",
"肿瘤结局": "治愈"
}}
"吸烟史": {{
"吸烟年数": "25年",
"日吸烟量": "10支/天",
"是否戒烟": "已戒烟15年"
}},
"饮酒史": "有"
"婚育史": {{
"是否已婚": "已婚",
"是否已育": "已育",
"已育数量": "1女"
}},
"结直肠癌家族史": {{
"遗传性结直肠癌类型": "无",
"亲属类型": "弟弟",
"其他遗传性肿瘤": "结肠癌"
}},
"体温": "36.0℃",
"呼吸": "18次/分",
"心率": "80次/分",
"血压": "120/80mmHg",
"BMI": "19.6"
"直肠指诊": {{
"直肠指诊姿势": "膝胸位",
"直肠指诊是否触及肿块": "有",
"直肠指诊肿块下缘到肛缘距离": "5cm",
"直肠指诊肿块下缘到齿状线距离": "无",
"直肠指诊肿块活动度": "尚可",
"直肠指诊指套推出是否染血": "有"
}},
"贫血貌": "有",
"巩膜黄染": "有",
"锁骨上淋巴结肿大": "有",
"腹壁静脉曲张": "有",
"肠形": "有",
"腹部压痛": "有",
}}
"""


def _parse_model_output(model_output, medical_record):
    """Return the JSON value contained in *model_output*.

    The known wrappers in ``_JSON_PATTERNS`` are tried in order and the first
    capture that parses as JSON wins. If none yields valid JSON, the first
    JSON object that can be decoded starting at the first ``{`` of the raw
    text is used -- the prompt asks the model for exactly that: a bare JSON
    object with no surrounding text.

    Raises:
        json.JSONDecodeError: a wrapper matched but nothing could be parsed;
            the error of the first failing wrapper is re-raised.

    When no wrapper matched and no bare object could be decoded, a dict
    describing the invalid output (including the raw text) is returned.
    """
    first_error = None
    for pattern in _JSON_PATTERNS:
        match = pattern.search(model_output)
        if match is None:
            continue
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError as exc:
            # Keep looking: a later wrapper or the bare-object fallback may
            # still succeed. Remember the first failure for reporting.
            if first_error is None:
                first_error = exc

    brace_at = model_output.find("{")
    if brace_at != -1:
        try:
            # raw_decode tolerates trailing text after the JSON object.
            parsed, _end = json.JSONDecoder().raw_decode(model_output, brace_at)
            return parsed
        except json.JSONDecodeError:
            pass

    if first_error is not None:
        raise first_error

    return {
        "error": "Invalid model output format",
        "original_text": medical_record,
        "model_output": model_output,
    }


def _write_json(path, payload):
    """Dump *payload* to *path* as indented, non-ASCII-escaped UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=4)


llm = LLM(model=model_path, dtype="half", gpu_memory_utilization=0.9)
sampling_params = SamplingParams(temperature=0.3, top_p=0.3, max_tokens=1500)

# Without the directory every write below fails -- including the one in the
# error handler, which would abort the whole run.
os.makedirs(output_path, exist_ok=True)

with open(texts_path, "r", encoding="utf-8") as records:
    for i, line in enumerate(records):
        # One record per line; drop surrounding whitespace.
        medical_record = line.strip()
        prompt = _build_prompt(medical_record)
        model_output = None
        try:
            # Ask the model for the extraction of this single record.
            outputs = llm.generate(prompt, sampling_params)
            model_output = outputs[0].outputs[0].text
            print(model_output)

            result_json = _parse_model_output(model_output, medical_record)

            # Save the result next to the source text.
            _write_json(
                os.path.join(output_path, f"long_txt{i}.json"),
                {"text_record": medical_record, "extracted_info": result_json},
            )
        except Exception as e:
            # Per-record boundary: log, persist what we have, move on.
            print(f"Error processing record {i}: {e}")
            error_payload = {"text_record": medical_record, "error": str(e)}
            if model_output is not None:
                # Keep the raw completion so the failure can be inspected.
                error_payload["model_output"] = model_output
            _write_json(
                os.path.join(output_path, f"long_txt{i}_error.json"),
                error_payload,
            )
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment