"""Extract structured clinical fields from medical records with a local LLM.

Originally added to the repository as ``q25_14_ins_int8.py``.

For every non-empty line of ``texts_path`` (one medical record per line) the
script builds an extraction prompt, asks a vLLM-served model for a JSON
answer, parses that answer and writes one JSON file per record into
``output_path``:

* ``long_txt<i>.json``        -- ``{"text_record", "extracted_info"}``
* ``long_txt<i>_error.json``  -- ``{"text_record", "error"[, "model_output"]}``

``<i>`` is the zero-based line index of the record in the input file.
"""

import json
import os
import re

# These switches are placed in the environment BEFORE torch / vllm are
# imported (those imports are deferred to main()).
# NOTE(review): the Hugging Face offline flags appear to be evaluated when the
# libraries are imported, so setting them after the imports (as the script
# used to do) would have no effect -- confirm against the installed versions.
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

texts_path = "/home/limeng/NLP/LLM/data/jcza/record_text2.txt"
model_path = "/home/limeng/NLP/LLM/model/Qwen2.5-14B-Instruct-GPTQ-Int8"
output_path = "/home/limeng/NLP/LLM/code/0220/result"

# Instruction part of the prompt. It is a plain (non-f) string, so the braces
# below are literal characters shown to the model.
PROMPT_INSTRUCTIONS = """
你是一个专业的医疗信息抽取助手。请从以下病历数据中严格按照#### 示例 JSON 提取字段信息,并确保:
1. **字段必须完整**,与示例 JSON 结构完全一致,不可缺失/新增/改动任何字段
2. **重点字段精确**:
   - 手术史需抽提所有手术名称+时间(格式:"手术名称": "时间")
   - 化疗方案需按"方案名": {时间+具体用药列表} 格式提取
   - 放疗方案需按"方案描述": {时间+次数+剂量} 格式提取
3. **严格空值处理**:
   - 字符串字段填"无"
   - 列表/字典字段填空列表[]/空字典{}
   - 嵌套结构需保持完整(如肿瘤患病史必须含"肿瘤类型"/"肿瘤结局"字段)
4. **严格JSON格式**:
   - 保持与示例完全相同的缩进/标点格式
   - 确保所有括号闭合,逗号正确
   - 生成第一个完整JSON后立即停止,禁止解释性文字
"""

# The example answer shown to the model. It is kept as a Python dict and
# rendered with json.dumps so that the example is guaranteed to be valid JSON:
# the hand-written text it replaces had missing and trailing commas, while the
# prompt asks the model to copy the example's punctuation exactly.
# Keys, values and their order are taken unchanged from that hand-written text.
EXAMPLE_RECORD = {
    "消瘦": "有",
    "呕吐": "有",
    "恶心": "有",
    "腹部肿块": "有",
    "腹胀": "有",
    "腹痛": "有",
    "里急后重": "有",
    "腹泻": "有",
    "大便形状改变": "有",
    "排便困难": "有",
    "黑便": "有",
    "便血": "有",
    "大便习惯和性状改变": "有",
    "肠梗阻": "有",
    "肠穿孔": "有",
    "手术史": {
        "直肠癌经腹前切除+末端回肠造口术": "2017-01-11日",
        # The leading space in this value is kept from the original example.
        "冠脉支架置入术": " 2019年",
    },
    "肝转移": "有",
    "肺转移": "有",
    "腹膜转移": "有",
    "骨转移": "有",
    "远处转移": "有",
    "锁骨上转移": "有",
    "腹股沟转移": "有",
    "腹膜后淋巴结转移": "有",
    "其他远处淋巴转移": "无",
    "化疗方案": {
        "2011-1-12行XELOX方案": {
            "时间": "2011-1-12",
            "具体用药": [
                "卡培他宾1.5g 2/日d1-14",
                "奥沙利铂200mg d1",
            ],
        },
        "2022-7-17、8-9行FOLFOX+西妥昔单抗方案": {
            "时间": "2022-7-17、8-9",
            "具体用药": [
                "奥沙利铂140mg 静滴D1",
                "亚叶酸钙0.6g 静滴D1",
                "5-FU 0.4g 静滴D1",
                "5-FU 4.0g 化疗泵入44h",
                "西妥昔单抗800mg 静滴D1",
            ],
        },
    },
    "放疗方案": {
        "盆腔复发灶": {
            "时间": "2023-04-20至2023-05-30",
            "次数": "25",
            "单次剂量": "2Gy",
            "总剂量": "50Gy",
        },
        "照射方法为适形调强放疗IMRT,分割方法为常规分割,疗效评估为PR": {
            "时间": "2021年11月15日开始,2021年12月24日结束",
            "次数": "25",
            "单次剂量": "2Gy",
            "总剂量": "",
        },
    },
    "仍需治疗的其他疾病情况": ["高血压", "糖尿病"],
    "入院前仍在服用的治疗药物": ["硝苯地平缓释片", "达格列净", "格列齐特"],
    "高血压史": "有",
    "伤寒史": "有",
    "结核史": "有",
    "病毒性肝炎史": "有",
    "糖尿病史": "有",
    "冠心病史": "有",
    "冠脉支架放置": "有",
    "脑卒中史": "有",
    "其他非肿瘤疾病": [
        "慢性乙型病毒性肝炎",
    ],
    "肿瘤患病史": {
        "肿瘤类型": "左肺腺癌",
        "肿瘤结局": "治愈",
    },
    "吸烟史": {
        "吸烟年数": "25年",
        "日吸烟量": "10支/天",
        "是否戒烟": "已戒烟15年",
    },
    "饮酒史": "有",
    "婚育史": {
        "是否已婚": "已婚",
        "是否已育": "已育",
        "已育数量": "1女",
    },
    "结直肠癌家族史": {
        "遗传性结直肠癌类型": "无",
        "亲属类型": "弟弟",
        "其他遗传性肿瘤": "结肠癌",
    },
    "体温": "36.0℃",
    "呼吸": "18次/分",
    "心率": "80次/分",
    "血压": "120/80mmHg",
    "BMI": "19.6",
    "直肠指诊": {
        "直肠指诊姿势": "膝胸位",
        "直肠指诊是否触及肿块": "有",
        "直肠指诊肿块下缘到肛缘距离": "5cm",
        "直肠指诊肿块下缘到齿状线距离": "无",
        "直肠指诊肿块活动度": "尚可",
        "直肠指诊指套推出是否染血": "有",
    },
    "贫血貌": "有",
    "巩膜黄染": "有",
    "锁骨上淋巴结肿大": "有",
    "腹壁静脉曲张": "有",
    "肠形": "有",
    "腹部压痛": "有",
}

EXAMPLE_JSON = json.dumps(EXAMPLE_RECORD, ensure_ascii=False, indent=4)

# Wrappers the model has been seen to put around its answer, tried in this
# order. Group 1 of each pattern is the JSON text.
JSON_BLOCK_PATTERNS = (
    re.compile(r'```json\n(.*?)\n```', re.DOTALL),
    re.compile(r'#### JSON 提取结果\n(.*?)\n#### JSON 提取结果', re.DOTALL),
    re.compile(r'#### JSON 输出\n(.*?)\n#### JSON 输出', re.DOTALL),
    re.compile(r'#### JSON 提取结果\n(.*?) 根据提', re.DOTALL),
)


def build_prompt(medical_record: str) -> str:
    """Return the full extraction prompt for one medical record.

    The prompt consists of the fixed instructions, the record itself under
    the ``#### 病历数据`` heading, and the example answer under the
    ``#### 示例 JSON`` heading.
    """
    return (
        f"{PROMPT_INSTRUCTIONS}\n"
        f"#### 病历数据\n{medical_record}\n\n"
        f"#### 示例 JSON\n{EXAMPLE_JSON}\n"
    )


def extract_json(model_output: str, medical_record: str = ""):
    """Parse the JSON answer contained in ``model_output``.

    Strategies, in order:

    1. every pattern of ``JSON_BLOCK_PATTERNS`` (fenced / heading-delimited
       answers);
    2. a bare JSON object starting at the first ``{`` of the text, with any
       trailing commentary ignored. Only the first ``{`` is tried on purpose:
       decoding from a later brace could silently return a nested fragment of
       a truncated answer.

    Returns:
        The decoded JSON value, or -- when the text contains no recognised
        wrapper and no decodable object -- a dict with the keys ``error``,
        ``original_text`` and ``model_output``.

    Raises:
        json.JSONDecodeError: a recognised wrapper was found but neither its
            content nor the bare-object fallback could be decoded.
    """
    wrapper_error = None
    for pattern in JSON_BLOCK_PATTERNS:
        match = pattern.search(model_output)
        if match is None:
            continue
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError as exc:
            # Remember the first failure; a later strategy may still succeed.
            if wrapper_error is None:
                wrapper_error = exc

    start = model_output.find("{")
    if start != -1:
        try:
            value, _end = json.JSONDecoder().raw_decode(model_output, start)
        except json.JSONDecodeError:
            pass
        else:
            return value

    if wrapper_error is not None:
        raise wrapper_error
    return {
        "error": "Invalid model output format",
        "original_text": medical_record,
        "model_output": model_output,
    }


def write_json(file_path: str, payload: dict) -> None:
    """Write ``payload`` to ``file_path`` as indented, non-ASCII-escaped JSON."""
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)


def process_record(llm, sampling_params, index: int, medical_record: str,
                   output_dir: str = output_path) -> None:
    """Run the model on one record and store the outcome in ``output_dir``.

    Any exception raised while generating, parsing or saving is reported on
    stdout and recorded in ``long_txt<index>_error.json`` so that one bad
    record does not abort the whole run.
    """
    model_output = None
    try:
        outputs = llm.generate(build_prompt(medical_record), sampling_params)
        model_output = outputs[0].outputs[0].text
        print(model_output)

        result_json = extract_json(model_output, medical_record)
        write_json(
            os.path.join(output_dir, f"long_txt{index}.json"),
            {"text_record": medical_record, "extracted_info": result_json},
        )
    except Exception as e:  # per-record boundary: log, persist, keep going
        print(f"Error processing record {index}: {e}")
        failure = {"text_record": medical_record, "error": str(e)}
        if model_output is not None:
            # Keep the raw completion so a parse failure can be inspected.
            failure["model_output"] = model_output
        write_json(
            os.path.join(output_dir, f"long_txt{index}_error.json"),
            failure,
        )


def main() -> None:
    """Load the model and process every record of ``texts_path``."""
    # Imported here, not at module top, so the environment variables set
    # above are already in place when these packages are loaded.
    import torch
    # The next two imports are not used by this script; they are kept so the
    # set of imported packages matches the original file.
    import torch.distributed as dist  # noqa: F401
    from tqdm import tqdm  # noqa: F401
    from vllm import LLM, SamplingParams

    torch.cuda.empty_cache()
    # Without the directory every write below (including the error report)
    # would fail.
    os.makedirs(output_path, exist_ok=True)

    llm = LLM(model=model_path, dtype="half", gpu_memory_utilization=0.9)
    sampling_params = SamplingParams(temperature=0.3, top_p=0.3, max_tokens=1500)

    with open(texts_path, "r", encoding="utf-8") as file:
        for i, line in enumerate(file):
            medical_record = line.strip()
            if not medical_record:
                # Blank line: nothing to extract. ``i`` still counts it, so
                # file names keep matching input line numbers.
                continue
            process_record(llm, sampling_params, i, medical_record)


if __name__ == "__main__":
    main()