diff --git a/q25_14_ins_int8.py b/q25_14_ins_int8.py new file mode 100644 index 0000000000000000000000000000000000000000..36fa3098625c767c2cfa873122f1789852e1f85d --- /dev/null +++ b/q25_14_ins_int8.py @@ -0,0 +1,203 @@ +import os +import re +import torch +import json +# import time +# import pynvml +from tqdm import tqdm +import torch.distributed as dist +from vllm import LLM, SamplingParams +os.environ["NCCL_P2P_DISABLE"] = "1" + +torch.cuda.empty_cache() +os.environ["TRANSFORMERS_OFFLINE"] = "1" +os.environ["HF_DATASETS_OFFLINE"] = "1" +os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" +texts_path = "/home/limeng/NLP/LLM/data/jcza/record_text2.txt" +model_path = "/home/limeng/NLP/LLM/model/Qwen2.5-14B-Instruct-GPTQ-Int8" +output_path = "/home/limeng/NLP/LLM/code/0220/result" + +llm = LLM(model=model_path, dtype="half",gpu_memory_utilization=0.9) +sampling_params = SamplingParams(temperature=0.3, top_p=0.3, max_tokens = 1500) + + +with open(texts_path, "r", encoding="utf-8") as file: + for i,line in enumerate(file): + medical_record = line.strip() # å¤„ç†æ¯ä¸€è¡Œï¼ŒåŽ»é™¤é¦–å°¾ç©ºç™½å—符 + prompt = f""" +ä½ æ˜¯ä¸€ä¸ªä¸“ä¸šçš„åŒ»ç–—ä¿¡æ¯æŠ½å–助手。请从以下病历数æ®ä¸ä¸¥æ ¼æŒ‰ç…§#### 示例 JSON æå–å—æ®µä¿¡æ¯ï¼Œå¹¶ç¡®ä¿ï¼š +1. **å—æ®µå¿…须完整**,与示例 JSON 结构完全一致,ä¸å¯ç¼ºå¤±/新增/æ”¹åŠ¨ä»»ä½•å—æ®µ +2. **é‡ç‚¹å—段精确**: + - 手术å²éœ€æŠ½ææ‰€æœ‰æ‰‹æœ¯åç§°+æ—¶é—´ï¼ˆæ ¼å¼ï¼š"手术åç§°": "æ—¶é—´") + - 化疗方案需按"方案å": {{æ—¶é—´+具体用è¯åˆ—表}} æ ¼å¼æå– + - 放疗方案需按"方案æè¿°": {{æ—¶é—´+次数+剂é‡}} æ ¼å¼æå– +3. **ä¸¥æ ¼ç©ºå€¼å¤„ç†**: + - å—ç¬¦ä¸²å—æ®µå¡«"æ— " + - 列表/å—兏嗿®µå¡«ç©ºåˆ—表[]/空å—å…¸{{}} + - åµŒå¥—ç»“æž„éœ€ä¿æŒå®Œæ•´ï¼ˆå¦‚肿瘤患病å²å¿…é¡»å«"肿瘤类型"/"肿瘤结局"å—æ®µï¼‰ +4. **ä¸¥æ ¼JSONæ ¼å¼**: + - ä¿æŒä¸Žç¤ºä¾‹å®Œå…¨ç›¸åŒçš„缩进/æ ‡ç‚¹æ ¼å¼ + - ç¡®ä¿æ‰€æœ‰æ‹¬å·é—åˆï¼Œé€—å·æ£ç¡® + - 生æˆç¬¬ä¸€ä¸ªå®Œæ•´JSONåŽç«‹å³åœæ¢ï¼Œç¦æ¢è§£é‡Šæ€§æ–‡å— + +#### ç—…åŽ†æ•°æ® +{medical_record} + +#### 示例 JSON +{{ + "消瘦": "有", + "å‘•å": "有", + "æ¶å¿ƒ": "有", + "腹部肿å—": "有", + "腹胀": "有", + "腹痛": "有", + "里急åŽé‡": "有", + "腹泻": "有", + "大便形状改å˜": "有", + "排便困难": "有", + "黑便": "有", + "便血": "有", + "å¤§ä¾¿ä¹ æƒ¯å’Œæ€§çŠ¶æ”¹å˜": "有", + "è‚ æ¢—é˜»": "有", + "è‚ ç©¿å”": "有", + "手术å²": {{ + "ç›´è‚ ç™Œç»è…¹å‰åˆ‡é™¤+æœ«ç«¯å›žè‚ é€ å£æœ¯": "2017-01-11æ—¥", + "å† è„‰æ”¯æž¶ç½®å…¥æœ¯":" 2019å¹´", + }}, + "è‚转移": "有", + "肺转移": "有", + "腹膜转移": "有", + "骨转移": "有", + "远处转移": "有", + "é”骨上转移": "有", + "腹股沟转移": "有", + "è…¹è†œåŽæ·‹å·´ç»“转移": "有", + "其他远处淋巴转移": "æ— ", + "化疗方案": {{ + "2011-1-12行XELOX方案": + {{"æ—¶é—´": "2011-1-12" + "具体用è¯": [ + "å¡åŸ¹ä»–宾1.5g 2/æ—¥d1-14", + "奥沙利铂200mg d1" + ] + }}, + "2022-7-17ã€8-9行FOLFOX+è¥¿å¦¥æ˜”å•æŠ—æ–¹æ¡ˆ": + {{ "æ—¶é—´": "2022-7-17ã€8-9" + "具体用è¯": [ + "奥沙利铂140mg 陿»´D1", + "亚å¶é…¸é’™0.6g 陿»´D1", + "5-FU 0.4g 陿»´D1", + "5-FU 4.0g 化疗泵入44h", + "è¥¿å¦¥æ˜”å•æŠ—800mg 陿»´D1" + ]}} + }}, + "放疗方案": + {{ + "盆腔å¤å‘ç¶": {{ + "æ—¶é—´": "2023-04-20至2023-05-30", + "次数": "25", + "啿¬¡å‰‚é‡":"2Gy", + "总剂é‡":"50Gy" + }}, + "照射方法为适形调强放疗IMRT,分割方法为常规分割,疗效评估为PR": {{ + "æ—¶é—´": "2021å¹´11月15日开始,2021å¹´12月24日结æŸ", + "次数": "25", + "啿¬¡å‰‚é‡":"2Gy", + "总剂é‡":"" + }}, + }}, + "ä»éœ€æ²»ç–—的其他疾病情况": ["高血压", "ç³–å°¿ç—…"], + "入院å‰ä»åœ¨æœç”¨çš„æ²»ç–—è¯ç‰©": ["ç¡è‹¯åœ°å¹³ç¼“释片", "è¾¾æ ¼åˆ—å‡€", "æ ¼åˆ—é½ç‰¹"], + "高血压å²": "有", + "伤寒å²": "有", + "ç»“æ ¸å²": "有", + "病毒性è‚炎å²": "有", + "ç³–å°¿ç—…å²": "有", + "å† å¿ƒç—…å²": "有", + "å† è„‰æ”¯æž¶æ”¾ç½®": "有", + "è„‘å’ä¸å²": "有", + "å…¶ä»–éžè‚¿ç˜¤ç–¾ç—…": [ + "慢性乙型病毒性è‚炎", + ], + "肿瘤患病å²": {{ + "肿瘤类型": "左肺腺癌", + "肿瘤结局": "治愈" + }} + "å¸çƒŸå²": {{ + "å¸çƒŸå¹´æ•°": "25å¹´", + "æ—¥å¸çƒŸé‡": "10支/天", + "æ˜¯å¦æˆ’烟": "已戒烟15å¹´" + }}, + "饮酒å²": "有" + "婚育å²": {{ + "是å¦å·²å©š": "已婚", + "是å¦å·²è‚²": "已育", + "已育数é‡": "1女" + }}, + "ç»“ç›´è‚ ç™Œå®¶æ—å²": {{ + "é—ä¼ æ€§ç»“ç›´è‚ ç™Œç±»åž‹": "æ— ", + "亲属类型": "弟弟", + "å…¶ä»–é—ä¼ æ€§è‚¿ç˜¤": "ç»“è‚ ç™Œ" + }}, + "体温": "36.0℃", + "呼å¸": "18次/分", + "心率": "80次/分", + "血压": "120/80mmHg", + "BMI": "19.6" + "ç›´è‚ æŒ‡è¯Š": {{ + "ç›´è‚ æŒ‡è¯Šå§¿åŠ¿": "è†èƒ¸ä½", + "ç›´è‚ æŒ‡è¯Šæ˜¯å¦è§¦åŠè‚¿å—": "有", + "ç›´è‚ æŒ‡è¯Šè‚¿å—下缘到肛缘è·ç¦»": "5cm", + "ç›´è‚ æŒ‡è¯Šè‚¿å—下缘到齿状线è·ç¦»": "æ— ", + "ç›´è‚ æŒ‡è¯Šè‚¿å—æ´»åŠ¨åº¦": "å°šå¯", + "ç›´è‚ æŒ‡è¯ŠæŒ‡å¥—æŽ¨å‡ºæ˜¯å¦æŸ“è¡€": "有" + }}, + "贫血貌": "有", + "巩膜黄染": "有", + "é”骨上淋巴结肿大": "有", + "è…¹å£é™è„‰æ›²å¼ ": "有", + "è‚ å½¢": "有", + "腹部压痛": "有", +}} +""" + try: + # 调用模型生æˆç»“æžœ + outputs = llm.generate(prompt, sampling_params) + model_output = outputs[0].outputs[0].text + print(model_output) + # å°è¯•æå– JSON 部分 + json_str = re.search(r'```json\n(.*?)\n```', model_output, re.DOTALL) + if json_str: + json_str = json_str.group(1) + result_json = json.loads(json_str) + else: + # 如果未找到 JSON 部分,直接ä¿å˜åŽŸå§‹æ–‡æœ¬ + json_str = re.search(r'#### JSON æå–结果\n(.*?)\n#### JSON æå–结果', model_output, re.DOTALL) + if json_str: + result_json = json.loads(json_str.group(1)) + else: + # 如果未找到 #### JSON æå–结果 æ ¼å¼ï¼Œå°è¯•æå– #### JSON 输出 æ ¼å¼ + json_str = re.search(r'#### JSON 输出\n(.*?)\n#### JSON 输出', model_output, re.DOTALL) + if json_str: + result_json = json.loads(json_str.group(1)) + else: + json_str = re.search(r'#### JSON æå–结果\n(.*?) æ ¹æ®æ', model_output, re.DOTALL) + if json_str: + result_json = json.loads(json_str.group(1)) + else: + result_json = {"error": "Invalid model output format", "original_text": medical_record, "model_output": model_output} + + # ä¿å˜ç»“果到 JSON 文件 + file_name = f"long_txt{i}.json" + file_path = os.path.join(output_path, file_name) + with open(file_path, 'w', encoding='utf-8') as f: + json.dump({"text_record": medical_record, "extracted_info": result_json}, f, ensure_ascii=False, indent=4) + + except Exception as e: + # æ•获异常并记录错误 + print(f"Error processing record {i}: {e}") + # ä¿å˜åŽŸå§‹æ–‡æœ¬ + file_name = f"long_txt{i}_error.json" + file_path = os.path.join(output_path, file_name) + with open(file_path, 'w', encoding='utf-8') as f: + json.dump({"text_record": medical_record, "error": str(e)}, f, ensure_ascii=False, indent=4) \ No newline at end of file