From de2f24da4bf2ff915b2cfa14ed4ace6965ffcb49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E7=9B=9F?= <1127928805@qq.com> Date: Thu, 27 Mar 2025 09:47:59 +0000 Subject: [PATCH] Upload New File --- xslx2csv.ipynb | 3591 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 3591 insertions(+) create mode 100644 xslx2csv.ipynb diff --git a/xslx2csv.ipynb b/xslx2csv.ipynb new file mode 100644 index 0000000..b130007 --- /dev/null +++ b/xslx2csv.ipynb @@ -0,0 +1,3591 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
节点名表名节点类型宽度是否必传注释说明
0COC_HBZL_RYJL$kh入院记录字符32Y卡号患者就诊卡卡号
1COC_HBZL_RYJL$klx入院记录字符16Y卡类型参见字典表
2COC_HBZL_RYJL$xgbz入院记录字符1Y修改标志1:正常 2:修改3:撤销
3COC_HBZL_RYJL$yjlxh入院记录字符32Y原纪录序号院内唯一标识
4COC_HBZL_RYJL$hzbh入院记录varchar64Y患者编号NaN
........................
1137COC_HBZL_SFJL$hgnpg随访记录varchar500NaN喉功能评估NaN
1138COC_HBZL_SFJL$zhzlsj随访记录varchar20NaN综合治疗时间【天/月/年】后
1139COC_HBZL_SFJL$zhzlfa随访记录varchar500NaN综合治疗方案NaN
1140COC_HBZL_SFJL$sxz随访记录varchar50NaN书写者NaN
1141COC_HBZL_SFJL$cjsj随访记录varchar20NaN创建时间NaN
\n", + "

1142 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " 节点名 表名 节点类型 宽度 是否必传 注释 说明\n", + "0 COC_HBZL_RYJL$kh 入院记录 字符 32 Y 卡号 患者就诊卡卡号\n", + "1 COC_HBZL_RYJL$klx 入院记录 字符 16 Y 卡类型 参见字典表\n", + "2 COC_HBZL_RYJL$xgbz 入院记录 字符 1 Y 修改标志 1:正常 2:修改3:撤销\n", + "3 COC_HBZL_RYJL$yjlxh 入院记录 字符 32 Y 原纪录序号 院内唯一标识\n", + "4 COC_HBZL_RYJL$hzbh 入院记录 varchar 64 Y 患者编号 NaN\n", + "... ... ... ... ... ... ... ...\n", + "1137 COC_HBZL_SFJL$hgnpg 随访记录 varchar 500 NaN 喉功能评估 NaN\n", + "1138 COC_HBZL_SFJL$zhzlsj 随访记录 varchar 20 NaN 综合治疗时间 【天/月/年】后\n", + "1139 COC_HBZL_SFJL$zhzlfa 随访记录 varchar 500 NaN 综合治疗方案 NaN\n", + "1140 COC_HBZL_SFJL$sxz 随访记录 varchar 50 NaN 书写者 NaN\n", + "1141 COC_HBZL_SFJL$cjsj 随访记录 varchar 20 NaN 创建时间 NaN\n", + "\n", + "[1142 rows x 7 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_excel('/home/limeng/SICT/lung_test/数据采集接口规范(喉癌).xlsx')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv('regular.csv',index = False)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "匹配结果统计:\n", + "未匹配 127\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import difflib\n", + "\n", + "def get_best_match(target, choices):\n", + " \"\"\"使用difflib找到最佳匹配\"\"\"\n", + " matches = difflib.get_close_matches(target, choices, n=1, cutoff=0.6)\n", + " return matches[0] if matches else None\n", + "\n", + "# 读取规范文件\n", + "regular_df = pd.read_csv('/home/limeng/SICT/lung_test/regular.csv')\n", + "\n", + "# 读取测试数据\n", + "test_df = pd.read_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例.csv', encoding='ISO-8859-1')\n", + "\n", + "# 创建规范字典,键为注释,值为对应的规则\n", + "regular_dict = dict(zip(regular_df['注释'], regular_df.to_dict('records')))\n", + "\n", + "# 创建新的注释列\n", + "matched_annotations = []\n", + "for _, row in test_df.iterrows():\n", + " # 组合三个字段\n", + " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", + " \n", + " # 在规范中查找最佳匹配\n", + " best_match = get_best_match(combined_field, regular_dict.keys())\n", + " matched_annotations.append(best_match if best_match else \"未匹配\")\n", + "\n", + "# 获取ValueItemKind列的位置\n", + "kind_idx = test_df.columns.get_loc('ValueItemKind')\n", + "\n", + "# 在ValueItemKind列前插入新的注释列\n", + "test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n", + "\n", + "# 保存结果\n", + "test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations.csv', index=False)\n", + "\n", + "# 打印匹配结果统计\n", + "print(\"\\n匹配结果统计:\")\n", + "print(pd.Series(matched_annotations).value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "测试文件编码: GB2312\n", + "规范文件编码: utf-8\n", + "文件成功读取!\n", + "\n", + "匹配结果统计:\n", + "未匹配 123\n", + "现病史-精神状态 1\n", + "体格检查-精神状态 1\n", + "体格检查-呼吸 1\n", + "体格检查-查体 1\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import difflib\n", + "import chardet\n", + "\n", + "# 首先检测文件的实际编码\n", + "def detect_encoding(file_path):\n", + " with open(file_path, 'rb') as f:\n", + " result = chardet.detect(f.read())\n", + " return result['encoding']\n", + "\n", + "# 检测文件编码\n", + "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n", + "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n", + "\n", + 
"test_encoding = detect_encoding(test_file)\n", + "regular_encoding = detect_encoding(regular_file)\n", + "\n", + "print(f\"测试文件编码: {test_encoding}\")\n", + "print(f\"规范文件编码: {regular_encoding}\")\n", + "\n", + "# 尝试使用检测到的编码读取文件\n", + "try:\n", + " # 读取规范文件\n", + " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", + " \n", + " # 读取测试数据\n", + " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", + " \n", + " print(\"文件成功读取!\")\n", + "except Exception as e:\n", + " print(f\"使用检测到的编码读取失败: {e}\")\n", + " \n", + " # 尝试其他常见编码\n", + " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", + " \n", + " for enc in encodings:\n", + " try:\n", + " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", + " test_df = pd.read_csv(test_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取测试文件\")\n", + " \n", + " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", + " regular_df = pd.read_csv(regular_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取规范文件\")\n", + " \n", + " test_encoding = enc\n", + " regular_encoding = enc\n", + " break\n", + " except Exception as e:\n", + " print(f\"使用 {enc} 读取失败: {e}\")\n", + "\n", + "# 如果成功读取文件,继续处理\n", + "if 'test_df' in locals() and 'regular_df' in locals():\n", + " def get_best_match(target, choices):\n", + " \"\"\"使用difflib找到最佳匹配\"\"\"\n", + " matches = difflib.get_close_matches(target, choices, n=1, cutoff=0.6)\n", + " return matches[0] if matches else None\n", + "\n", + " # 创建规范字典,键为注释,值为对应的规则\n", + " regular_dict = dict(zip(regular_df['注释'], regular_df.to_dict('records')))\n", + "\n", + " # 创建新的注释列\n", + " matched_annotations = []\n", + " for _, row in test_df.iterrows():\n", + " # 组合三个字段\n", + " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", + " \n", + " # 在规范中查找最佳匹配\n", + " best_match = get_best_match(combined_field, regular_dict.keys())\n", + " matched_annotations.append(best_match if best_match else \"未匹配\")\n", + "\n", + " # 获取ValueItemKind列的位置\n", + " kind_idx = test_df.columns.get_loc('ValueItemKind')\n", + "\n", + " # 在ValueItemKind列前插入新的注释列\n", + " test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n", + "\n", + " # 保存结果\n", + " test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations.csv', \n", + " index=False, encoding=test_encoding)\n", + "\n", + " # 打印匹配结果统计\n", + " print(\"\\n匹配结果统计:\")\n", + " print(pd.Series(matched_annotations).value_counts())\n", + "else:\n", + " print(\"无法读取文件,请手动检查文件编码\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "测试文件编码: GB2312\n", + "规范文件编码: utf-8\n", + "文件成功读取!\n", + "加载Sentence Transformer模型...\n", + "计算规范注释的嵌入向量...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 36/36 [00:01<00:00, 24.81it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "开始匹配注释...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 4/4 [00:00<00:00, 113.98it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "匹配结果统计:\n", + "未匹配 14\n", + "手术类型 10\n", + "皮肤黏膜-皮下出血 9\n", + "辅助检查-乙肝病毒-医院名称 9\n", + "患者基本情况 5\n", + "主任医师签名日期 5\n", + "手术医师签名 5\n", + "下一步治疗方案-具体方案 5\n", + "发起科室参与人员-主任医师 5\n", + "一般情况-主要症状及体征-姓名 5\n", + "一般情况-发育 3\n", + "中断放疗-是否 3\n", + "个人史-疫源接触史-接触时间 3\n", + "甲状腺-左侧甲状腺包块-有无 3\n", + "第一助手 2\n", + "第二助手 2\n", + "出院时情况 2\n", + "以下血管结构可见肿瘤包饶-颈总动脉 2\n", + "现病史-精神状态 2\n", + 
"入院时情况-主诉 2\n", + "病理报告-检查日期 2\n", + "入院时情况-患者年龄 2\n", + "讨论经过-病理科-医师姓名 2\n", + "系统回顾-运动骨骼系统-关节肿痛-时长 2\n", + "辅助检查-心电图-检查结论 2\n", + "颈部-颈部气管切开-硅胶气管筒 2\n", + "既往史-过敏史-药物食物过敏源 1\n", + "一般情况-神志 1\n", + "现病史-大便 1\n", + "系统回顾-泌尿系统-排尿困难 1\n", + "系统回顾-神经精神系统-癫痫 1\n", + "脊柱四肢-关节活动 1\n", + "咽-喉咽-喉咽后壁新生物-形态 1\n", + "讨论经过-病理科-病理结果 1\n", + "环后区-其他描述 1\n", + "系统回顾-泌尿系统-排尿困难-服用药物 1\n", + "系统回顾-血液系统-鼻衄史-目前清理 1\n", + "记录医师签名日期 1\n", + "一般情况-主要症状及体征-主诉 1\n", + "喉部增强CT-CT号 1\n", + "术前常规化验-化验日期 1\n", + "主治医师签名 1\n", + "主任医师签名 1\n", + "月经史-月经周期 1\n", + "术前常规化验-化验单位 1\n", + "Name: count, dtype: int64\n", + "\n", + "前5行匹配结果示例:\n", + "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", + "匹配注释: 讨论经过-病理科-医师姓名\n", + "相似度分数: 0.8398\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", + "匹配注释: 辅助检查-乙肝病毒-医院名称\n", + "相似度分数: 0.8360\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", + "匹配注释: 入院时情况-患者年龄\n", + "相似度分数: 0.8390\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-主诉-主诉\n", + "匹配注释: 入院时情况-主诉\n", + "相似度分数: 0.9434\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-入院日期-入院日期\n", + "匹配注释: 病理报告-检查日期\n", + "相似度分数: 0.9574\n", + "--------------------------------------------------\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sentence_transformers import SentenceTransformer\n", + "import chardet\n", + "\n", + "# 首先检测文件的实际编码\n", + "def detect_encoding(file_path):\n", + " with open(file_path, 'rb') as f:\n", + " result = chardet.detect(f.read())\n", + " return result['encoding']\n", + "\n", + "# 检测文件编码\n", + "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n", + "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n", + "\n", + "test_encoding = detect_encoding(test_file)\n", + "regular_encoding = detect_encoding(regular_file)\n", + "\n", + "print(f\"测试文件编码: {test_encoding}\")\n", + "print(f\"规范文件编码: {regular_encoding}\")\n", + "\n", + "# 尝试使用检测到的编码读取文件\n", + "try:\n", + " # 读取规范文件\n", + " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", + " \n", + " # 读取测试数据\n", + " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", + " \n", + " print(\"文件成功读取!\")\n", + "except Exception as e:\n", + " print(f\"使用检测到的编码读取失败: {e}\")\n", + " \n", + " # 尝试其他常见编码\n", + " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", + " \n", + " for enc in encodings:\n", + " try:\n", + " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", + " test_df = pd.read_csv(test_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取测试文件\")\n", + " \n", + " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", + " regular_df = pd.read_csv(regular_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取规范文件\")\n", + " \n", + " test_encoding = enc\n", + " regular_encoding = enc\n", + " break\n", + " except Exception as e:\n", + " print(f\"使用 {enc} 读取失败: {e}\")\n", + "\n", + "# 如果成功读取文件,继续处理\n", + "if 'test_df' in locals() and 'regular_df' in locals():\n", + " # 加载预训练的中文Sentence Transformer模型\n", + " print(\"加载Sentence Transformer模型...\")\n", + " model = SentenceTransformer('/home/limeng/SICT/lung_test/all-MiniLM-L6-v2') # 多语言模型,支持中文\n", + " \n", + " # 创建规范字典,键为注释,值为对应的规则\n", + " regular_annotations = regular_df['注释'].tolist()\n", + " \n", + " # 计算规范注释的嵌入向量\n", + " print(\"计算规范注释的嵌入向量...\")\n", + " regular_embeddings = model.encode(regular_annotations, show_progress_bar=True)\n", + " \n", + " # 创建新的注释列\n", + " matched_annotations = []\n", + " matched_scores = []\n", + " \n", + " 
print(\"开始匹配注释...\")\n", + " # 批量处理测试数据中的字段组合\n", + " combined_fields = []\n", + " for _, row in test_df.iterrows():\n", + " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", + " combined_fields.append(combined_field)\n", + " \n", + " # 计算测试数据的嵌入向量\n", + " test_embeddings = model.encode(combined_fields, show_progress_bar=True)\n", + " \n", + " # 计算相似度并找到最佳匹配\n", + " for i, test_embedding in enumerate(test_embeddings):\n", + " # 计算与所有规范注释的余弦相似度\n", + " similarities = np.dot(regular_embeddings, test_embedding) / (\n", + " np.linalg.norm(regular_embeddings, axis=1) * np.linalg.norm(test_embedding)\n", + " )\n", + " \n", + " # 找到最佳匹配\n", + " best_match_idx = np.argmax(similarities)\n", + " best_match_score = similarities[best_match_idx]\n", + " \n", + " # 如果相似度低于阈值,标记为未匹配\n", + " if best_match_score < 0.5: # 可以调整这个阈值\n", + " matched_annotations.append(\"未匹配\")\n", + " matched_scores.append(0.0)\n", + " else:\n", + " matched_annotations.append(regular_annotations[best_match_idx])\n", + " matched_scores.append(best_match_score)\n", + " \n", + " # 获取ValueItemKind列的位置\n", + " kind_idx = test_df.columns.get_loc('ValueItemKind')\n", + " \n", + " # 在ValueItemKind列前插入新的注释列和相似度分数列\n", + " test_df.insert(kind_idx, 'Matched_Score', matched_scores)\n", + " test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n", + " \n", + " # 保存结果\n", + " test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations_transformer.csv', \n", + " index=False, encoding=test_encoding)\n", + " \n", + " # 打印匹配结果统计\n", + " print(\"\\n匹配结果统计:\")\n", + " print(pd.Series(matched_annotations).value_counts())\n", + " \n", + " # 打印前5行匹配结果示例\n", + " print(\"\\n前5行匹配结果示例:\")\n", + " for i in range(min(5, len(test_df))):\n", + " print(f\"原始字段: {combined_fields[i]}\")\n", + " print(f\"匹配注释: {test_df.iloc[i]['Matched_Annotation']}\")\n", + " print(f\"相似度分数: {test_df.iloc[i]['Matched_Score']:.4f}\")\n", + " print(\"-\" * 50)\n", + "else:\n", + " print(\"无法读取文件,请手动检查文件编码\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Building prefix dict from the default dictionary ...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "测试文件编码: GB2312\n", + "规范文件编码: utf-8\n", + "文件成功读取!\n", + "开始匹配注释...\n", + "处理第 0/127 条记录...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Dumping model to file cache /tmp/jieba.cache\n", + "Loading model cost 0.792 seconds.\n", + "Prefix dict has been built successfully.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "处理第 100/127 条记录...\n", + "\n", + "匹配结果统计:\n", + "体格检查 20\n", + "未匹配 20\n", + "手术经过 13\n", + "姓名 11\n", + "主刀医师 10\n", + "现病史 7\n", + "性别 5\n", + "手术名称 5\n", + "小结时间 5\n", + "麻醉方式 5\n", + "Name: count, dtype: int64\n", + "\n", + "匹配方法使用统计:\n", + "Partial 100\n", + "None 20\n", + "Levenshtein 5\n", + "TF-IDF 2\n", + "Name: count, dtype: int64\n", + "\n", + "前5行匹配结果示例:\n", + "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", + "匹配注释: 姓名\n", + "相似度分数: 1.0000\n", + "匹配方法: Partial\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", + "匹配注释: 姓名\n", + "相似度分数: 1.0000\n", + "匹配方法: Partial\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", + "匹配注释: 姓名\n", + "相似度分数: 1.0000\n", + "匹配方法: Partial\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-主诉-主诉\n", + "匹配注释: 主诉\n", 
+ "相似度分数: 1.0000\n", + "匹配方法: Partial\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-入院日期-入院日期\n", + "匹配注释: 病理报告-检查日期\n", + "相似度分数: 0.6774\n", + "匹配方法: Levenshtein\n", + "--------------------------------------------------\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import chardet\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import jieba\n", + "import Levenshtein\n", + "from fuzzywuzzy import fuzz\n", + "\n", + "# 首先检测文件的实际编码\n", + "def detect_encoding(file_path):\n", + " with open(file_path, 'rb') as f:\n", + " result = chardet.detect(f.read())\n", + " return result['encoding']\n", + "\n", + "# 检测文件编码\n", + "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n", + "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n", + "\n", + "test_encoding = detect_encoding(test_file)\n", + "regular_encoding = detect_encoding(regular_file)\n", + "\n", + "print(f\"测试文件编码: {test_encoding}\")\n", + "print(f\"规范文件编码: {regular_encoding}\")\n", + "\n", + "# 尝试使用检测到的编码读取文件\n", + "try:\n", + " # 读取规范文件\n", + " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", + " \n", + " # 读取测试数据\n", + " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", + " \n", + " print(\"文件成功读取!\")\n", + "except Exception as e:\n", + " print(f\"使用检测到的编码读取失败: {e}\")\n", + " \n", + " # 尝试其他常见编码\n", + " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", + " \n", + " for enc in encodings:\n", + " try:\n", + " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", + " test_df = pd.read_csv(test_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取测试文件\")\n", + " \n", + " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", + " regular_df = pd.read_csv(regular_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取规范文件\")\n", + " \n", + " test_encoding = enc\n", + " regular_encoding = enc\n", + " break\n", + " except Exception as e:\n", + " print(f\"使用 {enc} 读取失败: {e}\")\n", + "\n", + "# 如果成功读取文件,继续处理\n", + "if 'test_df' in locals() and 'regular_df' in locals():\n", + " # 创建规范字典,键为注释,值为对应的规则\n", + " regular_annotations = regular_df['注释'].tolist()\n", + " \n", + " # 准备测试数据中的字段组合\n", + " combined_fields = []\n", + " for _, row in test_df.iterrows():\n", + " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", + " combined_fields.append(combined_field)\n", + " \n", + " # 定义多种相似度计算方法\n", + " def calculate_similarities(query, candidates):\n", + " \"\"\"计算多种相似度指标\"\"\"\n", + " results = []\n", + " \n", + " # 1. TF-IDF + 余弦相似度\n", + " try:\n", + " # 对中文文本进行分词\n", + " segmented_query = ' '.join(jieba.cut(query))\n", + " segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n", + " \n", + " # 计算TF-IDF向量\n", + " vectorizer = TfidfVectorizer()\n", + " tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n", + " \n", + " # 计算余弦相似度\n", + " cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n", + " \n", + " # 找到最佳匹配\n", + " best_idx_tfidf = np.argmax(cosine_sim)\n", + " best_score_tfidf = cosine_sim[best_idx_tfidf]\n", + " results.append((candidates[best_idx_tfidf], best_score_tfidf, \"TF-IDF\"))\n", + " except Exception as e:\n", + " print(f\"TF-IDF计算失败: {e}\")\n", + " \n", + " # 2. 
Levenshtein距离(编辑距离)\n", + " try:\n", + " lev_distances = [Levenshtein.distance(query, c) for c in candidates]\n", + " # 将距离转换为相似度分数(越小越相似)\n", + " max_len = max(len(query), max(len(c) for c in candidates))\n", + " lev_similarities = [1 - dist/max_len for dist in lev_distances]\n", + " \n", + " best_idx_lev = np.argmax(lev_similarities)\n", + " best_score_lev = lev_similarities[best_idx_lev]\n", + " results.append((candidates[best_idx_lev], best_score_lev, \"Levenshtein\"))\n", + " except Exception as e:\n", + " print(f\"Levenshtein计算失败: {e}\")\n", + " \n", + " # 3. FuzzyWuzzy比率\n", + " try:\n", + " fuzzy_ratios = [fuzz.ratio(query, c)/100 for c in candidates]\n", + " best_idx_fuzzy = np.argmax(fuzzy_ratios)\n", + " best_score_fuzzy = fuzzy_ratios[best_idx_fuzzy]\n", + " results.append((candidates[best_idx_fuzzy], best_score_fuzzy, \"FuzzyWuzzy\"))\n", + " except Exception as e:\n", + " print(f\"FuzzyWuzzy计算失败: {e}\")\n", + " \n", + " # 4. FuzzyWuzzy部分比率(处理子字符串)\n", + " try:\n", + " partial_ratios = [fuzz.partial_ratio(query, c)/100 for c in candidates]\n", + " best_idx_partial = np.argmax(partial_ratios)\n", + " best_score_partial = partial_ratios[best_idx_partial]\n", + " results.append((candidates[best_idx_partial], best_score_partial, \"Partial\"))\n", + " except Exception as e:\n", + " print(f\"Partial比率计算失败: {e}\")\n", + " \n", + " # 5. FuzzyWuzzy令牌排序比率(处理词序不同)\n", + " try:\n", + " token_sort_ratios = [fuzz.token_sort_ratio(query, c)/100 for c in candidates]\n", + " best_idx_token = np.argmax(token_sort_ratios)\n", + " best_score_token = token_sort_ratios[best_idx_token]\n", + " results.append((candidates[best_idx_token], best_score_token, \"TokenSort\"))\n", + " except Exception as e:\n", + " print(f\"TokenSort比率计算失败: {e}\")\n", + " \n", + " # 找出所有方法中得分最高的结果\n", + " best_result = max(results, key=lambda x: x[1]) if results else (None, 0, None)\n", + " \n", + " return best_result\n", + " \n", + " # 对每个测试字段进行匹配\n", + " matched_annotations = []\n", + " matched_scores = []\n", + " matched_methods = []\n", + " \n", + " print(\"开始匹配注释...\")\n", + " for i, query in enumerate(combined_fields):\n", + " if i % 100 == 0:\n", + " print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n", + " \n", + " # 计算多种相似度并选择最佳匹配\n", + " best_match, best_score, best_method = calculate_similarities(query, regular_annotations)\n", + " \n", + " # 如果相似度低于阈值,标记为未匹配\n", + " if best_score < 0.6: # 可以调整这个阈值\n", + " matched_annotations.append(\"未匹配\")\n", + " matched_scores.append(0.0)\n", + " matched_methods.append(\"None\")\n", + " else:\n", + " matched_annotations.append(best_match)\n", + " matched_scores.append(best_score)\n", + " matched_methods.append(best_method)\n", + " \n", + " # 获取ValueItemKind列的位置\n", + " kind_idx = test_df.columns.get_loc('ValueItemKind')\n", + " \n", + " # 在ValueItemKind列前插入新的列\n", + " test_df.insert(kind_idx, 'Matched_Method', matched_methods)\n", + " test_df.insert(kind_idx, 'Matched_Score', matched_scores)\n", + " test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n", + " \n", + " # 保存结果\n", + " test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations_multi.csv', \n", + " index=False, encoding=test_encoding)\n", + " \n", + " # 打印匹配结果统计\n", + " print(\"\\n匹配结果统计:\")\n", + " print(pd.Series(matched_annotations).value_counts().head(10))\n", + " \n", + " # 打印方法使用统计\n", + " print(\"\\n匹配方法使用统计:\")\n", + " print(pd.Series(matched_methods).value_counts())\n", + " \n", + " # 打印前5行匹配结果示例\n", + " print(\"\\n前5行匹配结果示例:\")\n", + " for i in range(min(5, len(test_df))):\n", 
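+ "        # NOTE: fuzz.partial_ratio scores any substring hit at 100, which is why the short\n", + "        # annotation 姓名 \"matches\" 性别 and 年龄 above with a perfect 1.0: substring overlap,\n", + "        # not semantic agreement, is what makes Partial dominate the method statistics.\n",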
+ " print(f\"原始字段: {combined_fields[i]}\")\n", + " print(f\"匹配注释: {test_df.iloc[i]['Matched_Annotation']}\")\n", + " print(f\"相似度分数: {test_df.iloc[i]['Matched_Score']:.4f}\")\n", + " print(f\"匹配方法: {test_df.iloc[i]['Matched_Method']}\")\n", + " print(\"-\" * 50)\n", + "else:\n", + " print(\"无法读取文件,请手动检查文件编码\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "测试文件编码: GB2312\n", + "规范文件编码: utf-8\n", + "文件成功读取!\n", + "开始匹配注释...\n", + "处理第 0/127 条记录...\n", + "处理第 100/127 条记录...\n", + "\n", + "各方法匹配结果统计:\n", + "TF-IDF匹配结果:\n", + "主刀医师 15\n", + "卡号 14\n", + "手术经过 13\n", + "既往史-手术外伤史-手术史-手术时间 10\n", + "患者姓名 7\n", + "患者基本情况 6\n", + "手术名称 5\n", + "麻醉方式 5\n", + "参与人员 4\n", + "主诉 3\n", + "Name: count, dtype: int64\n", + "\n", + "Levenshtein匹配结果:\n", + "既往史-手术外伤史-手术史-有无 13\n", + "体格检查-血压-收缩压 10\n", + "既往史-手术外伤史-手术史-手术时间 10\n", + "入院时情况-患者姓名 8\n", + "主要辅助检查-实验室检查-Na 8\n", + "主治医师签名 6\n", + "既往史-手术外伤史-手术史-手术名称 5\n", + "患者性别 5\n", + "主治医师签名时间 5\n", + "个人史-饮酒史-主要饮酒种类 5\n", + "Name: count, dtype: int64\n", + "\n", + "FuzzyWuzzy匹配结果:\n", + "讨论经过-耳鼻喉科/眼科-具体手术方案 13\n", + "患者姓名 11\n", + "既往史-手术外伤史-手术史-手术时间 10\n", + "体格检查-血压-收缩压 9\n", + "主要辅助检查-实验室检查-Na 8\n", + "主治医师签名 6\n", + "发起科室参与人员-主任医师 5\n", + "麻醉方式 5\n", + "患者性别 5\n", + "既往史-手术外伤史-手术史-手术名称 5\n", + "Name: count, dtype: int64\n", + "\n", + "前5行匹配结果示例:\n", + "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", + "TF-IDF匹配: 患者姓名\n", + "Levenshtein匹配: 患者姓名\n", + "FuzzyWuzzy匹配: 患者姓名\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", + "TF-IDF匹配: 患者性别\n", + "Levenshtein匹配: 患者姓名\n", + "FuzzyWuzzy匹配: 患者姓名\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", + "TF-IDF匹配: 患者年龄\n", + "Levenshtein匹配: 患者姓名\n", + "FuzzyWuzzy匹配: 患者姓名\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-主诉-主诉\n", + "TF-IDF匹配: 主诉\n", + "Levenshtein匹配: 入院时情况-主诉\n", + "FuzzyWuzzy匹配: 一般情况-主要症状及体征-主诉\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-入院日期-入院日期\n", + "TF-IDF匹配: 入院时情况-入院时间\n", + "Levenshtein匹配: 病理报告-检查日期\n", + "FuzzyWuzzy匹配: 现病史-外院手术日期\n", + "--------------------------------------------------\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import chardet\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import jieba\n", + "import Levenshtein\n", + "from fuzzywuzzy import fuzz\n", + "\n", + "# 首先检测文件的实际编码\n", + "def detect_encoding(file_path):\n", + " with open(file_path, 'rb') as f:\n", + " result = chardet.detect(f.read())\n", + " return result['encoding']\n", + "\n", + "# 检测文件编码\n", + "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n", + "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n", + "\n", + "test_encoding = detect_encoding(test_file)\n", + "regular_encoding = detect_encoding(regular_file)\n", + "\n", + "print(f\"测试文件编码: {test_encoding}\")\n", + "print(f\"规范文件编码: {regular_encoding}\")\n", + "\n", + "# 尝试使用检测到的编码读取文件\n", + "try:\n", + " # 读取规范文件\n", + " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", + " \n", + " # 读取测试数据\n", + " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", + " \n", + " print(\"文件成功读取!\")\n", + "except Exception as e:\n", + " print(f\"使用检测到的编码读取失败: {e}\")\n", + " \n", + " # 尝试其他常见编码\n", + " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", + " 
\n", + " for enc in encodings:\n", + " try:\n", + " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", + " test_df = pd.read_csv(test_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取测试文件\")\n", + " \n", + " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", + " regular_df = pd.read_csv(regular_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取规范文件\")\n", + " \n", + " test_encoding = enc\n", + " regular_encoding = enc\n", + " break\n", + " except Exception as e:\n", + " print(f\"使用 {enc} 读取失败: {e}\")\n", + "\n", + "# 如果成功读取文件,继续处理\n", + "if 'test_df' in locals() and 'regular_df' in locals():\n", + " # 创建规范字典,键为注释,值为对应的规则\n", + " regular_annotations = regular_df['注释'].tolist()\n", + " \n", + " # 准备测试数据中的字段组合\n", + " combined_fields = []\n", + " for _, row in test_df.iterrows():\n", + " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", + " combined_fields.append(combined_field)\n", + " \n", + " # 定义多种相似度计算方法\n", + " def calculate_similarities(query, candidates):\n", + " \"\"\"计算多种相似度指标,返回每种方法的最佳匹配\"\"\"\n", + " results = {}\n", + " \n", + " # 1. TF-IDF + 余弦相似度\n", + " try:\n", + " # 对中文文本进行分词\n", + " segmented_query = ' '.join(jieba.cut(query))\n", + " segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n", + " \n", + " # 计算TF-IDF向量\n", + " vectorizer = TfidfVectorizer()\n", + " tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n", + " \n", + " # 计算余弦相似度\n", + " cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n", + " \n", + " # 找到最佳匹配\n", + " best_idx_tfidf = np.argmax(cosine_sim)\n", + " results['TF-IDF'] = candidates[best_idx_tfidf]\n", + " except Exception as e:\n", + " print(f\"TF-IDF计算失败: {e}\")\n", + " results['TF-IDF'] = \"未匹配\"\n", + " \n", + " # 2. Levenshtein距离(编辑距离)\n", + " try:\n", + " lev_distances = [Levenshtein.distance(query, c) for c in candidates]\n", + " # 将距离转换为相似度分数(越小越相似)\n", + " best_idx_lev = np.argmin(lev_distances)\n", + " results['Levenshtein'] = candidates[best_idx_lev]\n", + " except Exception as e:\n", + " print(f\"Levenshtein计算失败: {e}\")\n", + " results['Levenshtein'] = \"未匹配\"\n", + " \n", + " # 3. FuzzyWuzzy比率\n", + " try:\n", + " fuzzy_ratios = [fuzz.ratio(query, c) for c in candidates]\n", + " best_idx_fuzzy = np.argmax(fuzzy_ratios)\n", + " results['FuzzyWuzzy'] = candidates[best_idx_fuzzy]\n", + " except Exception as e:\n", + " print(f\"FuzzyWuzzy计算失败: {e}\")\n", + " results['FuzzyWuzzy'] = \"未匹配\"\n", + " \n", + " # 4. FuzzyWuzzy部分比率(处理子字符串)\n", + " try:\n", + " partial_ratios = [fuzz.partial_ratio(query, c) for c in candidates]\n", + " best_idx_partial = np.argmax(partial_ratios)\n", + " results['Partial'] = candidates[best_idx_partial]\n", + " except Exception as e:\n", + " print(f\"Partial比率计算失败: {e}\")\n", + " results['Partial'] = \"未匹配\"\n", + " \n", + " # 5. 
FuzzyWuzzy令牌排序比率(处理词序不同)\n", + " try:\n", + " token_sort_ratios = [fuzz.token_sort_ratio(query, c) for c in candidates]\n", + " best_idx_token = np.argmax(token_sort_ratios)\n", + " results['TokenSort'] = candidates[best_idx_token]\n", + " except Exception as e:\n", + " print(f\"TokenSort比率计算失败: {e}\")\n", + " results['TokenSort'] = \"未匹配\"\n", + " \n", + " return results\n", + " \n", + " # 对每个测试字段进行匹配\n", + " tfidf_matches = []\n", + " levenshtein_matches = []\n", + " fuzzywuzzy_matches = []\n", + " \n", + " print(\"开始匹配注释...\")\n", + " for i, query in enumerate(combined_fields):\n", + " if i % 100 == 0:\n", + " print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n", + " \n", + " # 计算多种相似度\n", + " matches = calculate_similarities(query, regular_annotations)\n", + " \n", + " # 保存各种方法的匹配结果\n", + " tfidf_matches.append(matches.get('TF-IDF', \"未匹配\"))\n", + " levenshtein_matches.append(matches.get('Levenshtein', \"未匹配\"))\n", + " fuzzywuzzy_matches.append(matches.get('FuzzyWuzzy', \"未匹配\"))\n", + " \n", + " # 获取ValueItemKind列的位置\n", + " kind_idx = test_df.columns.get_loc('ValueItemKind')\n", + " \n", + " # 在ValueItemKind列前插入新的列\n", + " test_df.insert(kind_idx, 'FuzzyWuzzy_Match', fuzzywuzzy_matches)\n", + " test_df.insert(kind_idx, 'Levenshtein_Match', levenshtein_matches)\n", + " test_df.insert(kind_idx, 'TFIDF_Match', tfidf_matches)\n", + " \n", + " # 保存结果\n", + " test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations_all.csv', \n", + " index=False, encoding=test_encoding)\n", + " \n", + " # 打印匹配结果统计\n", + " print(\"\\n各方法匹配结果统计:\")\n", + " print(\"TF-IDF匹配结果:\")\n", + " print(pd.Series(tfidf_matches).value_counts().head(10))\n", + " print(\"\\nLevenshtein匹配结果:\")\n", + " print(pd.Series(levenshtein_matches).value_counts().head(10))\n", + " print(\"\\nFuzzyWuzzy匹配结果:\")\n", + " print(pd.Series(fuzzywuzzy_matches).value_counts().head(10))\n", + " \n", + " # 打印前5行匹配结果示例\n", + " print(\"\\n前5行匹配结果示例:\")\n", + " for i in range(min(5, len(test_df))):\n", + " print(f\"原始字段: {combined_fields[i]}\")\n", + " print(f\"TF-IDF匹配: {test_df.iloc[i]['TFIDF_Match']}\")\n", + " print(f\"Levenshtein匹配: {test_df.iloc[i]['Levenshtein_Match']}\")\n", + " print(f\"FuzzyWuzzy匹配: {test_df.iloc[i]['FuzzyWuzzy_Match']}\")\n", + " print(\"-\" * 50)\n", + "else:\n", + " print(\"无法读取文件,请手动检查文件编码\")" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 20\u001b[0m\n\u001b[1;32m 17\u001b[0m test_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 18\u001b[0m regular_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/home/limeng/SICT/lung_test/data/regular.csv\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 20\u001b[0m test_encoding \u001b[38;5;241m=\u001b[39m \u001b[43mdetect_encoding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtest_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 21\u001b[0m regular_encoding \u001b[38;5;241m=\u001b[39m detect_encoding(regular_file)\n\u001b[1;32m 23\u001b[0m 
\u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m测试文件编码: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtest_encoding\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", + "Cell \u001b[0;32mIn[1], line 12\u001b[0m, in \u001b[0;36mdetect_encoding\u001b[0;34m(file_path)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdetect_encoding\u001b[39m(file_path):\n\u001b[0;32m---> 12\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 13\u001b[0m result \u001b[38;5;241m=\u001b[39m chardet\u001b[38;5;241m.\u001b[39mdetect(f\u001b[38;5;241m.\u001b[39mread())\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mencoding\u001b[39m\u001b[38;5;124m'\u001b[39m]\n", + "File \u001b[0;32m~/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/IPython/core/interactiveshell.py:324\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 319\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 320\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. 
If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 321\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 322\u001b[0m )\n\u001b[0;32m--> 324\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import chardet\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import jieba\n", + "import Levenshtein\n", + "from fuzzywuzzy import fuzz\n", + "\n", + "# 首先检测文件的实际编码\n", + "def detect_encoding(file_path):\n", + " with open(file_path, 'rb') as f:\n", + " result = chardet.detect(f.read())\n", + " return result['encoding']\n", + "\n", + "# 检测文件编码\n", + "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", + "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n", + "\n", + "test_encoding = detect_encoding(test_file)\n", + "regular_encoding = detect_encoding(regular_file)\n", + "\n", + "print(f\"测试文件编码: {test_encoding}\")\n", + "print(f\"规范文件编码: {regular_encoding}\")\n", + "\n", + "# 尝试使用检测到的编码读取文件\n", + "try:\n", + " # 读取规范文件\n", + " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", + " \n", + " # 读取测试数据\n", + " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", + " \n", + " print(\"文件成功读取!\")\n", + "except Exception as e:\n", + " print(f\"使用检测到的编码读取失败: {e}\")\n", + " \n", + " # 尝试其他常见编码\n", + " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", + " \n", + " for enc in encodings:\n", + " try:\n", + " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", + " test_df = pd.read_csv(test_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取测试文件\")\n", + " \n", + " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", + " regular_df = pd.read_csv(regular_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取规范文件\")\n", + " \n", + " test_encoding = enc\n", + " regular_encoding = enc\n", + " break\n", + " except Exception as e:\n", + " print(f\"使用 {enc} 读取失败: {e}\")\n", + "\n", + "# 如果成功读取文件,继续处理\n", + "if 'test_df' in locals() and 'regular_df' in locals():\n", + " # 创建规范字典,键为注释,值为对应的规则\n", + " regular_annotations = regular_df['注释'].tolist()\n", + " \n", + " # 准备测试数据中的字段组合\n", + " combined_fields = []\n", + " paragraph_names = []\n", + " statement_names = []\n", + " value_item_names = []\n", + " \n", + " for _, row in test_df.iterrows():\n", + " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", + " combined_fields.append(combined_field)\n", + " paragraph_names.append(row['ParagraphName'])\n", + " statement_names.append(row['StatementName'])\n", + " value_item_names.append(row['ValueItemName'])\n", + " \n", + " # 定义多种相似度计算方法\n", + " def calculate_similarities(query, candidates):\n", + " \"\"\"计算多种相似度指标,返回每种方法的最佳匹配和分数\"\"\"\n", + " results = {}\n", + " scores = {}\n", + " \n", + " # 1. 
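TF-IDF NOTE: the vectorizer below is re-fit from scratch on every call, i.e. once per\n", + "        # row; fitting it once on the candidate corpus and reusing transform() per query\n", + "        # would be far cheaper, at the cost of slightly different idf weights.\n", + "        # 1. 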
TF-IDF + 余弦相似度\n", + " try:\n", + " # 对中文文本进行分词\n", + " segmented_query = ' '.join(jieba.cut(query))\n", + " segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n", + " \n", + " # 计算TF-IDF向量\n", + " vectorizer = TfidfVectorizer()\n", + " tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n", + " \n", + " # 计算余弦相似度\n", + " cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n", + " \n", + " # 找到最佳匹配\n", + " best_idx_tfidf = np.argmax(cosine_sim)\n", + " results['TF-IDF'] = candidates[best_idx_tfidf]\n", + " scores['TF-IDF'] = cosine_sim[best_idx_tfidf]\n", + " except Exception as e:\n", + " print(f\"TF-IDF计算失败: {e}\")\n", + " results['TF-IDF'] = \"未匹配\"\n", + " scores['TF-IDF'] = 0.0\n", + " \n", + " # 2. Levenshtein距离(编辑距离)\n", + " try:\n", + " lev_distances = [Levenshtein.distance(query, c) for c in candidates]\n", + " # 将距离转换为相似度分数(越小越相似)\n", + " max_len = max(len(query), max(len(c) for c in candidates))\n", + " lev_similarities = [1 - dist/max_len for dist in lev_distances]\n", + " \n", + " best_idx_lev = np.argmax(lev_similarities)\n", + " results['Levenshtein'] = candidates[best_idx_lev]\n", + " scores['Levenshtein'] = lev_similarities[best_idx_lev]\n", + " except Exception as e:\n", + " print(f\"Levenshtein计算失败: {e}\")\n", + " results['Levenshtein'] = \"未匹配\"\n", + " scores['Levenshtein'] = 0.0\n", + " \n", + " # 3. FuzzyWuzzy比率\n", + " try:\n", + " fuzzy_ratios = [fuzz.ratio(query, c)/100 for c in candidates]\n", + " best_idx_fuzzy = np.argmax(fuzzy_ratios)\n", + " results['FuzzyWuzzy'] = candidates[best_idx_fuzzy]\n", + " scores['FuzzyWuzzy'] = fuzzy_ratios[best_idx_fuzzy]\n", + " except Exception as e:\n", + " print(f\"FuzzyWuzzy计算失败: {e}\")\n", + " results['FuzzyWuzzy'] = \"未匹配\"\n", + " scores['FuzzyWuzzy'] = 0.0\n", + " \n", + " return results, scores\n", + " \n", + " # 对每个测试字段进行匹配\n", + " tfidf_matches = []\n", + " levenshtein_matches = []\n", + " fuzzywuzzy_matches = []\n", + " best_matches = []\n", + " best_match_methods = []\n", + " \n", + " print(\"开始匹配注释...\")\n", + " for i in range(len(combined_fields)):\n", + " if i % 100 == 0:\n", + " print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n", + " \n", + " query = combined_fields[i]\n", + " paragraph_name = paragraph_names[i]\n", + " value_item_name = value_item_names[i]\n", + " \n", + " # 1. 首先检查是否有注释包含ParagraphName\n", + " paragraph_matches = [ann for ann in regular_annotations if paragraph_name in ann]\n", + " \n", + " if paragraph_matches:\n", + " # 2. 如果有包含ParagraphName的注释,再检查是否有同时包含ValueItemName的\n", + " value_matches = [ann for ann in paragraph_matches if value_item_name in ann]\n", + " if value_matches:\n", + " # 找到同时包含ParagraphName和ValueItemName的注释\n", + " best_match = value_matches[0] # 取第一个匹配\n", + " best_match_method = \"精确匹配(段落+值)\"\n", + " else:\n", + " # 只找到包含ParagraphName的注释\n", + " best_match = paragraph_matches[0] # 取第一个匹配\n", + " best_match_method = \"段落匹配\"\n", + " else:\n", + " # 3. 
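Fallback NOTE: the \"comparison\" columns a few lines below call calculate_similarities\n", + "            # again for every row, so this branch computes it twice; caching the first\n", + "            # result would halve the per-row cost.\n", + "            # 3. 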
如果没有包含ParagraphName的注释,直接使用相似度指标\n", + " matches, scores = calculate_similarities(query, regular_annotations)\n", + " \n", + " # 选择得分最高的方法\n", + " best_method = max(scores.items(), key=lambda x: x[1])[0]\n", + " best_match = matches[best_method]\n", + " best_match_method = f\"相似度({best_method})\"\n", + " \n", + " # 计算相似度匹配以便比较\n", + " matches, _ = calculate_similarities(query, regular_annotations)\n", + " tfidf_matches.append(matches.get('TF-IDF', \"未匹配\"))\n", + " levenshtein_matches.append(matches.get('Levenshtein', \"未匹配\"))\n", + " fuzzywuzzy_matches.append(matches.get('FuzzyWuzzy', \"未匹配\"))\n", + " \n", + " best_matches.append(best_match)\n", + " best_match_methods.append(best_match_method)\n", + " \n", + " # 获取ValueItemKind列的位置\n", + " kind_idx = test_df.columns.get_loc('ValueItemKind')\n", + " \n", + " # 在ValueItemKind列前插入新的列\n", + " test_df.insert(kind_idx, 'Best_Match_Method', best_match_methods)\n", + " test_df.insert(kind_idx, 'Best_Match', best_matches)\n", + " test_df.insert(kind_idx, 'FuzzyWuzzy_Match', fuzzywuzzy_matches)\n", + " test_df.insert(kind_idx, 'Levenshtein_Match', levenshtein_matches)\n", + " test_df.insert(kind_idx, 'TFIDF_Match', tfidf_matches)\n", + " \n", + " # 保存结果\n", + " test_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_with_annotations_all3.csv', \n", + " index=False, encoding=test_encoding)\n", + " \n", + " # 打印匹配结果统计\n", + " print(\"\\n最佳匹配方法统计:\")\n", + " print(pd.Series(best_match_methods).value_counts())\n", + " \n", + " print(\"\\n各方法匹配结果统计:\")\n", + " print(\"最佳匹配结果:\")\n", + " print(pd.Series(best_matches).value_counts().head(10))\n", + " print(\"\\nTF-IDF匹配结果:\")\n", + " print(pd.Series(tfidf_matches).value_counts().head(10))\n", + " print(\"\\nLevenshtein匹配结果:\")\n", + " print(pd.Series(levenshtein_matches).value_counts().head(10))\n", + " print(\"\\nFuzzyWuzzy匹配结果:\")\n", + " print(pd.Series(fuzzywuzzy_matches).value_counts().head(10))\n", + " \n", + " # 打印前5行匹配结果示例\n", + " print(\"\\n前5行匹配结果示例:\")\n", + " for i in range(min(5, len(test_df))):\n", + " print(f\"原始字段: {combined_fields[i]}\")\n", + " print(f\"最佳匹配: {test_df.iloc[i]['Best_Match']} (方法: {test_df.iloc[i]['Best_Match_Method']})\")\n", + " print(f\"TF-IDF匹配: {test_df.iloc[i]['TFIDF_Match']}\")\n", + " print(f\"Levenshtein匹配: {test_df.iloc[i]['Levenshtein_Match']}\")\n", + " print(f\"FuzzyWuzzy匹配: {test_df.iloc[i]['FuzzyWuzzy_Match']}\")\n", + " print(\"-\" * 50)\n", + "else:\n", + " print(\"无法读取文件,请手动检查文件编码\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Building prefix dict from the default dictionary ...\n", + "Loading model from cache /tmp/jieba.cache\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "测试文件编码: GB2312\n", + "规范文件编码: utf-8\n", + "文件成功读取!\n", + "开始匹配注释...\n", + "处理第 0/127 条记录...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading model cost 1.175 seconds.\n", + "Prefix dict has been built successfully.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "处理第 100/127 条记录...\n", + "\n", + "最佳匹配方法统计:\n", + "TF-IDF相似度 89\n", + "段落匹配 32\n", + "精确匹配(段落+值) 6\n", + "Name: count, dtype: int64\n", + "\n", + "匹配结果统计:\n", + "最佳匹配结果:\n", + "体格检查-体温 20\n", + "主刀医师 15\n", + "卡号 14\n", + "手术经过 13\n", + "既往史-手术外伤史-手术史-手术时间 10\n", + "患者姓名 7\n", + "患者基本情况 6\n", + "现病史-发病日期 6\n", + "麻醉方式 5\n", + "手术名称 5\n", + "Name: count, dtype: int64\n", + "\n", + "TF-IDF匹配结果:\n", + "主刀医师 
15\n", + "卡号 14\n", + "手术经过 13\n", + "既往史-手术外伤史-手术史-手术时间 10\n", + "患者姓名 7\n", + "患者基本情况 6\n", + "手术名称 5\n", + "麻醉方式 5\n", + "参与人员 4\n", + "主诉 3\n", + "Name: count, dtype: int64\n", + "\n", + "前5行匹配结果示例:\n", + "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", + "最佳匹配: 患者姓名 (方法: TF-IDF相似度)\n", + "相似度分数: 0.5819\n", + "TF-IDF匹配: 患者姓名\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", + "最佳匹配: 患者性别 (方法: TF-IDF相似度)\n", + "相似度分数: 0.6637\n", + "TF-IDF匹配: 患者性别\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", + "最佳匹配: 患者年龄 (方法: TF-IDF相似度)\n", + "相似度分数: 0.6576\n", + "TF-IDF匹配: 患者年龄\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-主诉-主诉\n", + "最佳匹配: 主诉 (方法: TF-IDF相似度)\n", + "相似度分数: 0.7628\n", + "TF-IDF匹配: 主诉\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-入院日期-入院日期\n", + "最佳匹配: 入院时情况-入院时间 (方法: TF-IDF相似度)\n", + "相似度分数: 0.5297\n", + "TF-IDF匹配: 入院时情况-入院时间\n", + "--------------------------------------------------\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import chardet\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import jieba\n", + "\n", + "# 首先检测文件的实际编码\n", + "def detect_encoding(file_path):\n", + " with open(file_path, 'rb') as f:\n", + " result = chardet.detect(f.read())\n", + " return result['encoding']\n", + "\n", + "# 检测文件编码\n", + "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", + "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n", + "\n", + "test_encoding = detect_encoding(test_file)\n", + "regular_encoding = detect_encoding(regular_file)\n", + "\n", + "print(f\"测试文件编码: {test_encoding}\")\n", + "print(f\"规范文件编码: {regular_encoding}\")\n", + "\n", + "# 尝试使用检测到的编码读取文件\n", + "try:\n", + " # 读取规范文件\n", + " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", + " \n", + " # 读取测试数据\n", + " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", + " \n", + " print(\"文件成功读取!\")\n", + "except Exception as e:\n", + " print(f\"使用检测到的编码读取失败: {e}\")\n", + " \n", + " # 尝试其他常见编码\n", + " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", + " \n", + " for enc in encodings:\n", + " try:\n", + " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", + " test_df = pd.read_csv(test_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取测试文件\")\n", + " \n", + " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", + " regular_df = pd.read_csv(regular_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取规范文件\")\n", + " \n", + " test_encoding = enc\n", + " regular_encoding = enc\n", + " break\n", + " except Exception as e:\n", + " print(f\"使用 {enc} 读取失败: {e}\")\n", + "\n", + "# 如果成功读取文件,继续处理\n", + "if 'test_df' in locals() and 'regular_df' in locals():\n", + " # 创建规范字典,键为注释,值为对应的规则\n", + " regular_annotations = regular_df['注释'].tolist()\n", + " \n", + " # 准备测试数据中的字段组合\n", + " combined_fields = []\n", + " paragraph_names = []\n", + " statement_names = []\n", + " value_item_names = []\n", + " \n", + " for _, row in test_df.iterrows():\n", + " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", + " combined_fields.append(combined_field)\n", + " paragraph_names.append(row['ParagraphName'])\n", + " statement_names.append(row['StatementName'])\n", + " value_item_names.append(row['ValueItemName'])\n", + " \n", + " # 定义TF-IDF相似度计算方法\n", + " def calculate_tfidf_similarity(query, candidates):\n", + " 
\"\"\"计算TF-IDF相似度,返回最佳匹配和分数\"\"\"\n", + " try:\n", + " # 对中文文本进行分词\n", + " segmented_query = ' '.join(jieba.cut(query))\n", + " segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n", + " \n", + " # 计算TF-IDF向量\n", + " vectorizer = TfidfVectorizer()\n", + " tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n", + " \n", + " # 计算余弦相似度\n", + " cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n", + " \n", + " # 找到最佳匹配\n", + " best_idx = np.argmax(cosine_sim)\n", + " return candidates[best_idx], cosine_sim[best_idx]\n", + " except Exception as e:\n", + " print(f\"TF-IDF计算失败: {e}\")\n", + " return \"未匹配\", 0.0\n", + " \n", + " # 对每个测试字段进行匹配\n", + " tfidf_matches = []\n", + " best_matches = []\n", + " best_match_methods = []\n", + " similarity_scores = []\n", + " \n", + " print(\"开始匹配注释...\")\n", + " for i in range(len(combined_fields)):\n", + " if i % 100 == 0:\n", + " print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n", + " \n", + " query = combined_fields[i]\n", + " paragraph_name = paragraph_names[i]\n", + " value_item_name = value_item_names[i]\n", + " \n", + " # 1. 首先检查是否有注释包含ParagraphName\n", + " paragraph_matches = [ann for ann in regular_annotations if paragraph_name in ann]\n", + " \n", + " if paragraph_matches:\n", + " # 2. 如果有包含ParagraphName的注释,再检查是否有同时包含ValueItemName的\n", + " value_matches = [ann for ann in paragraph_matches if value_item_name in ann]\n", + " if value_matches:\n", + " # 找到同时包含ParagraphName和ValueItemName的注释\n", + " best_match = value_matches[0] # 取第一个匹配\n", + " best_match_method = \"精确匹配(段落+值)\"\n", + " similarity_score = 1.0 # 精确匹配给予最高分\n", + " else:\n", + " # 只找到包含ParagraphName的注释\n", + " best_match = paragraph_matches[0] # 取第一个匹配\n", + " best_match_method = \"段落匹配\"\n", + " similarity_score = 0.8 # 段落匹配给予较高分\n", + " else:\n", + " # 3. 
如果没有包含ParagraphName的注释,使用TF-IDF相似度\n", + " best_match, similarity_score = calculate_tfidf_similarity(query, regular_annotations)\n", + " best_match_method = \"TF-IDF相似度\"\n", + " \n", + " # 计算TF-IDF匹配以便比较\n", + " tfidf_match, _ = calculate_tfidf_similarity(query, regular_annotations)\n", + " tfidf_matches.append(tfidf_match)\n", + " \n", + " best_matches.append(best_match)\n", + " best_match_methods.append(best_match_method)\n", + " similarity_scores.append(similarity_score)\n", + " \n", + " # 获取ValueItemKind列的位置\n", + " kind_idx = test_df.columns.get_loc('ValueItemKind')\n", + " \n", + " # 在ValueItemKind列前插入新的列\n", + " test_df.insert(kind_idx, 'Similarity_Score', similarity_scores)\n", + " test_df.insert(kind_idx, 'Best_Match_Method', best_match_methods)\n", + " test_df.insert(kind_idx, 'Best_Match', best_matches)\n", + " test_df.insert(kind_idx, 'TFIDF_Match', tfidf_matches)\n", + " \n", + " # 保存结果\n", + " test_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_with_tfidf.csv', \n", + " index=False, encoding=test_encoding)\n", + " \n", + " # 打印匹配结果统计\n", + " print(\"\\n最佳匹配方法统计:\")\n", + " print(pd.Series(best_match_methods).value_counts())\n", + " \n", + " print(\"\\n匹配结果统计:\")\n", + " print(\"最佳匹配结果:\")\n", + " print(pd.Series(best_matches).value_counts().head(10))\n", + " print(\"\\nTF-IDF匹配结果:\")\n", + " print(pd.Series(tfidf_matches).value_counts().head(10))\n", + " \n", + " # 打印前5行匹配结果示例\n", + " print(\"\\n前5行匹配结果示例:\")\n", + " for i in range(min(5, len(test_df))):\n", + " print(f\"原始字段: {combined_fields[i]}\")\n", + " print(f\"最佳匹配: {test_df.iloc[i]['Best_Match']} (方法: {test_df.iloc[i]['Best_Match_Method']})\")\n", + " print(f\"相似度分数: {test_df.iloc[i]['Similarity_Score']:.4f}\")\n", + " print(f\"TF-IDF匹配: {test_df.iloc[i]['TFIDF_Match']}\")\n", + " print(\"-\" * 50)\n", + "else:\n", + " print(\"无法读取文件,请手动检查文件编码\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "测试文件编码: GB2312\n", + "规范文件编码: utf-8\n", + "文件成功读取!\n", + "开始匹配注释...\n", + "处理第 0/127 条记录...\n", + "处理第 100/127 条记录...\n", + "\n", + "匹配完成,共处理 127 条记录\n", + "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\n", + "\n", + "前3行匹配结果示例:\n", + "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", + "匹配1: 患者姓名 (分数: 0.5819)\n", + "匹配2: 患者姓名 (分数: 0.5819)\n", + "匹配3: 姓名 (分数: 0.5574)\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", + "匹配1: 患者性别 (分数: 0.6637)\n", + "匹配2: 性别 (分数: 0.6416)\n", + "匹配3: 性别 (分数: 0.6416)\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", + "匹配1: 患者年龄 (分数: 0.6576)\n", + "匹配2: 年龄 (分数: 0.6348)\n", + "匹配3: 年龄 (分数: 0.6348)\n", + "--------------------------------------------------\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import chardet\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import jieba\n", + "\n", + "# 首先检测文件的实际编码\n", + "def detect_encoding(file_path):\n", + " with open(file_path, 'rb') as f:\n", + " result = chardet.detect(f.read())\n", + " return result['encoding']\n", + "\n", + "# 检测文件编码\n", + "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", + "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n", + "\n", + "test_encoding = detect_encoding(test_file)\n", + "regular_encoding = detect_encoding(regular_file)\n", + "\n", + "print(f\"测试文件编码: 
{test_encoding}\")\n", + "print(f\"规范文件编码: {regular_encoding}\")\n", + "\n", + "# 尝试使用检测到的编码读取文件\n", + "try:\n", + " # 读取规范文件\n", + " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", + " \n", + " # 读取测试数据\n", + " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", + " \n", + " print(\"文件成功读取!\")\n", + "except Exception as e:\n", + " print(f\"使用检测到的编码读取失败: {e}\")\n", + " \n", + " # 尝试其他常见编码\n", + " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", + " \n", + " for enc in encodings:\n", + " try:\n", + " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", + " test_df = pd.read_csv(test_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取测试文件\")\n", + " \n", + " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", + " regular_df = pd.read_csv(regular_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取规范文件\")\n", + " \n", + " test_encoding = enc\n", + " regular_encoding = enc\n", + " break\n", + " except Exception as e:\n", + " print(f\"使用 {enc} 读取失败: {e}\")\n", + "\n", + "# 如果成功读取文件,继续处理\n", + "if 'test_df' in locals() and 'regular_df' in locals():\n", + " # 创建规范字典,键为注释,值为对应的规则\n", + " regular_annotations = regular_df['注释'].tolist()\n", + " \n", + " # 准备测试数据中的字段组合\n", + " combined_fields = []\n", + " \n", + " for _, row in test_df.iterrows():\n", + " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", + " combined_fields.append(combined_field)\n", + " \n", + " # 定义TF-IDF相似度计算方法,返回前3个最佳匹配\n", + " def calculate_top3_tfidf_similarity(query, candidates):\n", + " \"\"\"计算TF-IDF相似度,返回前3个最佳匹配和分数\"\"\"\n", + " try:\n", + " # 对中文文本进行分词\n", + " segmented_query = ' '.join(jieba.cut(query))\n", + " segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n", + " \n", + " # 计算TF-IDF向量\n", + " vectorizer = TfidfVectorizer()\n", + " tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n", + " \n", + " # 计算余弦相似度\n", + " cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n", + " \n", + " # 找到前3个最佳匹配\n", + " top3_indices = np.argsort(cosine_sim)[::-1][:3]\n", + " top3_scores = cosine_sim[top3_indices]\n", + " \n", + " return top3_indices, top3_scores\n", + " except Exception as e:\n", + " print(f\"TF-IDF计算失败: {e}\")\n", + " return [-1, -1, -1], [0.0, 0.0, 0.0]\n", + " \n", + " # 创建结果DataFrame\n", + " result_data = []\n", + " \n", + " print(\"开始匹配注释...\")\n", + " for i in range(len(test_df)):\n", + " if i % 100 == 0:\n", + " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n", + " \n", + " query = combined_fields[i]\n", + " \n", + " # 使用TF-IDF相似度匹配,获取前3个最佳匹配\n", + " top3_indices, top3_scores = calculate_top3_tfidf_similarity(query, regular_annotations)\n", + " \n", + " # 获取测试数据的相关字段\n", + " paragraph_name = test_df.iloc[i]['ParagraphName']\n", + " statement_name = test_df.iloc[i]['StatementName']\n", + " value_item_name = test_df.iloc[i]['ValueItemName']\n", + " display_string = test_df.iloc[i]['DisplayString']\n", + " \n", + " # 获取前3个规范数据的相关字段\n", + " regular_nodes = []\n", + " regular_annotations_matched = []\n", + " regular_descriptions = []\n", + " \n", + " for idx, score in zip(top3_indices, top3_scores):\n", + " if idx >= 0:\n", + " regular_nodes.append(regular_df.iloc[idx]['节点名'])\n", + " regular_annotations_matched.append(regular_df.iloc[idx]['注释'])\n", + " regular_descriptions.append(regular_df.iloc[idx]['说明'])\n", + " else:\n", + " regular_nodes.append(\"未匹配\")\n", + " regular_annotations_matched.append(\"未匹配\")\n", + " regular_descriptions.append(\"未匹配\")\n", + " \n", + " # 
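Padding NOTE: argsort(...)[::-1][:3] already caps the list at three entries, so the\n", + "        # while-loop below only fires when the spec has fewer than three annotations;\n", + "        # it pads with 未匹配 placeholders and extends top3_scores to keep columns aligned.\n", + "        # 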
确保有3个结果(如果候选项少于3个)\n", + " while len(regular_nodes) < 3:\n", + " regular_nodes.append(\"未匹配\")\n", + " regular_annotations_matched.append(\"未匹配\")\n", + " regular_descriptions.append(\"未匹配\")\n", + " top3_scores = np.append(top3_scores, 0.0)\n", + " \n", + " # 添加到结果数据\n", + " result_data.append({\n", + " 'ParagraphName': paragraph_name,\n", + " 'StatementName': statement_name,\n", + " 'ValueItemName': value_item_name,\n", + " 'DisplayString': display_string,\n", + " '规范节点名1': regular_nodes[0],\n", + " '规范注释1': regular_annotations_matched[0],\n", + " '规范说明1': regular_descriptions[0],\n", + " '相似度分数1': top3_scores[0],\n", + " '规范节点名2': regular_nodes[1],\n", + " '规范注释2': regular_annotations_matched[1],\n", + " '规范说明2': regular_descriptions[1],\n", + " '相似度分数2': top3_scores[1],\n", + " '规范节点名3': regular_nodes[2],\n", + " '规范注释3': regular_annotations_matched[2],\n", + " '规范说明3': regular_descriptions[2],\n", + " '相似度分数3': top3_scores[2]\n", + " })\n", + " \n", + " # 创建结果DataFrame\n", + " result_df = pd.DataFrame(result_data)\n", + " \n", + " # 保存结果\n", + " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv', \n", + " index=False, encoding=test_encoding)\n", + " \n", + " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n", + " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\")\n", + " \n", + " # 打印前3行匹配结果示例\n", + " print(\"\\n前3行匹配结果示例:\")\n", + " for i in range(min(3, len(result_df))):\n", + " print(f\"原始字段: {combined_fields[i]}\")\n", + " print(f\"匹配1: {result_df.iloc[i]['规范注释1']} (分数: {result_df.iloc[i]['相似度分数1']:.4f})\")\n", + " print(f\"匹配2: {result_df.iloc[i]['规范注释2']} (分数: {result_df.iloc[i]['相似度分数2']:.4f})\")\n", + " print(f\"匹配3: {result_df.iloc[i]['规范注释3']} (分数: {result_df.iloc[i]['相似度分数3']:.4f})\")\n", + " print(\"-\" * 50)\n", + "else:\n", + " print(\"无法读取文件,请手动检查文件编码\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "测试文件编码: GB2312\n", + "规范文件编码: utf-8\n", + "文件成功读取!\n", + "加载Sentence Transformer模型...\n", + "对规范注释进行编码...\n", + "开始匹配注释...\n", + "处理第 0/127 条记录...\n", + "处理第 100/127 条记录...\n", + "\n", + "匹配完成,共处理 127 条记录\n", + "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_transformer_top3_match.csv\n", + "\n", + "前3行匹配结果示例:\n", + "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", + "匹配1: 讨论经过-病理科-医师姓名 (分数: 0.8398)\n", + "匹配2: 讨论经过-放射科-医师姓名 (分数: 0.8398)\n", + "匹配3: 讨论经过-放化疗科-医师姓名 (分数: 0.8384)\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", + "匹配1: 辅助检查-乙肝病毒-医院名称 (分数: 0.8360)\n", + "匹配2: 辅助检查-骨扫描检查-医院名称 (分数: 0.8325)\n", + "匹配3: 讨论经过-病理科-医师姓名 (分数: 0.8265)\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", + "匹配1: 入院时情况-患者年龄 (分数: 0.8390)\n", + "匹配2: 辅助检查-乙肝病毒-医院名称 (分数: 0.8046)\n", + "匹配3: 辅助检查-骨扫描检查-医院名称 (分数: 0.8012)\n", + "--------------------------------------------------\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import chardet\n", + "from sentence_transformers import SentenceTransformer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import torch\n", + "\n", + "# 首先检测文件的实际编码\n", + "def detect_encoding(file_path):\n", + " with open(file_path, 'rb') as f:\n", + " result = chardet.detect(f.read())\n", + " return result['encoding']\n", + "\n", + "# 检测文件编码\n", + "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", + "regular_file = 
'/home/limeng/SICT/lung_test/data/regular.csv'\n", + "\n", + "test_encoding = detect_encoding(test_file)\n", + "regular_encoding = detect_encoding(regular_file)\n", + "\n", + "print(f\"测试文件编码: {test_encoding}\")\n", + "print(f\"规范文件编码: {regular_encoding}\")\n", + "\n", + "# 尝试使用检测到的编码读取文件\n", + "try:\n", + " # 读取规范文件\n", + " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", + " \n", + " # 读取测试数据\n", + " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", + " \n", + " print(\"文件成功读取!\")\n", + "except Exception as e:\n", + " print(f\"使用检测到的编码读取失败: {e}\")\n", + " \n", + " # 尝试其他常见编码\n", + " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", + " \n", + " for enc in encodings:\n", + " try:\n", + " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", + " test_df = pd.read_csv(test_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取测试文件\")\n", + " \n", + " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", + " regular_df = pd.read_csv(regular_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取规范文件\")\n", + " \n", + " test_encoding = enc\n", + " regular_encoding = enc\n", + " break\n", + " except Exception as e:\n", + " print(f\"使用 {enc} 读取失败: {e}\")\n", + "\n", + "# 如果成功读取文件,继续处理\n", + "if 'test_df' in locals() and 'regular_df' in locals():\n", + " # 创建规范字典,键为注释,值为对应的规则\n", + " regular_annotations = regular_df['注释'].tolist()\n", + " \n", + " # 准备测试数据中的字段组合\n", + " combined_fields = []\n", + " \n", + " for _, row in test_df.iterrows():\n", + " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", + " combined_fields.append(combined_field)\n", + " \n", + " # 加载预训练的中文Sentence Transformer模型\n", + " print(\"加载Sentence Transformer模型...\")\n", + " \n", + " # 如果都失败,尝试加载基础模型\n", + " model = SentenceTransformer('/home/limeng/SICT/lung_test/all-MiniLM-L6-v2')\n", + "\n", + " # 定义Sentence Transformer相似度计算方法,返回前3个最佳匹配\n", + " def calculate_top3_transformer_similarity(query, candidates, model):\n", + " \"\"\"计算Sentence Transformer相似度,返回前3个最佳匹配和分数\"\"\"\n", + " try:\n", + " # 编码查询和候选项\n", + " query_embedding = model.encode([query], convert_to_tensor=True)\n", + " candidate_embeddings = model.encode(candidates, convert_to_tensor=True)\n", + " \n", + " # 计算余弦相似度\n", + " cosine_scores = cosine_similarity(\n", + " query_embedding.cpu().numpy(), \n", + " candidate_embeddings.cpu().numpy()\n", + " )[0]\n", + " \n", + " # 找到前3个最佳匹配\n", + " top3_indices = np.argsort(cosine_scores)[::-1][:3]\n", + " top3_scores = cosine_scores[top3_indices]\n", + " \n", + " return top3_indices, top3_scores\n", + " except Exception as e:\n", + " print(f\"Transformer相似度计算失败: {e}\")\n", + " return [-1, -1, -1], [0.0, 0.0, 0.0]\n", + " \n", + " # 创建结果DataFrame\n", + " result_data = []\n", + " \n", + " # 先对规范注释进行编码,避免重复计算\n", + " print(\"对规范注释进行编码...\")\n", + " \n", + " # 创建结果DataFrame\n", + " result_data = []\n", + " \n", + " print(\"开始匹配注释...\")\n", + " for i in range(len(test_df)):\n", + " if i % 100 == 0:\n", + " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n", + " \n", + " query = combined_fields[i]\n", + " \n", + " # 使用Sentence Transformer相似度匹配,获取前3个最佳匹配\n", + " top3_indices, top3_scores = calculate_top3_transformer_similarity(query, regular_annotations, model)\n", + " \n", + " # 获取测试数据的相关字段\n", + " paragraph_name = test_df.iloc[i]['ParagraphName']\n", + " statement_name = test_df.iloc[i]['StatementName']\n", + " value_item_name = test_df.iloc[i]['ValueItemName']\n", + " display_string = test_df.iloc[i]['DisplayString']\n", + " \n", + " # 获取前3个规范数据的相关字段\n", + " regular_nodes = []\n", + 
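" # NOTE (editor): a hedged aside -- despite the \"对规范注释进行编码...\" print above,\n",
+ " # calculate_top3_transformer_similarity re-encodes every candidate annotation on\n",
+ " # each iteration of this loop. A minimal sketch, assuming regular_annotations is\n",
+ " # fixed across queries: compute candidate_embeddings = model.encode(\n",
+ " # regular_annotations, convert_to_tensor=True) once before the loop and reuse\n",
+ " # it inside the function, encoding only the query per iteration.\n",
+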
" regular_annotations_matched = []\n", + " regular_descriptions = []\n", + " \n", + " for idx, score in zip(top3_indices, top3_scores):\n", + " if idx >= 0:\n", + " regular_nodes.append(regular_df.iloc[idx]['节点名'])\n", + " regular_annotations_matched.append(regular_df.iloc[idx]['注释'])\n", + " regular_descriptions.append(regular_df.iloc[idx]['说明'])\n", + " else:\n", + " regular_nodes.append(\"未匹配\")\n", + " regular_annotations_matched.append(\"未匹配\")\n", + " regular_descriptions.append(\"未匹配\")\n", + " \n", + " # 确保有3个结果(如果候选项少于3个)\n", + " while len(regular_nodes) < 3:\n", + " regular_nodes.append(\"未匹配\")\n", + " regular_annotations_matched.append(\"未匹配\")\n", + " regular_descriptions.append(\"未匹配\")\n", + " top3_scores = np.append(top3_scores, 0.0)\n", + " \n", + " # 添加到结果数据\n", + " result_data.append({\n", + " 'ParagraphName': paragraph_name,\n", + " 'StatementName': statement_name,\n", + " 'ValueItemName': value_item_name,\n", + " 'DisplayString': display_string,\n", + " '规范节点名1': regular_nodes[0],\n", + " '规范注释1': regular_annotations_matched[0],\n", + " '规范说明1': regular_descriptions[0],\n", + " '相似度分数1': top3_scores[0],\n", + " '规范节点名2': regular_nodes[1],\n", + " '规范注释2': regular_annotations_matched[1],\n", + " '规范说明2': regular_descriptions[1],\n", + " '相似度分数2': top3_scores[1],\n", + " '规范节点名3': regular_nodes[2],\n", + " '规范注释3': regular_annotations_matched[2],\n", + " '规范说明3': regular_descriptions[2],\n", + " '相似度分数3': top3_scores[2]\n", + " })\n", + " \n", + " # 创建结果DataFrame\n", + " result_df = pd.DataFrame(result_data)\n", + " \n", + " # 保存结果\n", + " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_transformer_top3_match.csv', \n", + " index=False, encoding=test_encoding)\n", + " \n", + " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n", + " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_transformer_top3_match.csv\")\n", + " \n", + " # 打印前3行匹配结果示例\n", + " print(\"\\n前3行匹配结果示例:\")\n", + " for i in range(min(3, len(result_df))):\n", + " print(f\"原始字段: {combined_fields[i]}\")\n", + " print(f\"匹配1: {result_df.iloc[i]['规范注释1']} (分数: {result_df.iloc[i]['相似度分数1']:.4f})\")\n", + " print(f\"匹配2: {result_df.iloc[i]['规范注释2']} (分数: {result_df.iloc[i]['相似度分数2']:.4f})\")\n", + " print(f\"匹配3: {result_df.iloc[i]['规范注释3']} (分数: {result_df.iloc[i]['相似度分数3']:.4f})\")\n", + " print(\"-\" * 50)\n", + "else:\n", + " print(\"无法读取文件,请手动检查文件编码\")" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "测试文件编码: GB2312\n", + "规范文件编码: utf-8\n", + "文件成功读取!\n", + "初始化TF-IDF向量化器...\n", + "开始匹配注释...\n", + "处理第 0/127 条记录...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n", + " warnings.warn(\n", + "Building prefix dict from the default dictionary ...\n", + "Dumping model to file cache /tmp/jieba.cache\n", + "Loading model cost 0.934 seconds.\n", + "Prefix dict has been built successfully.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "处理第 100/127 条记录...\n", + "\n", + "匹配完成,共处理 127 条记录\n", + "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\n", + "\n", + "前3行匹配结果示例:\n", + "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", + "匹配1: 患者姓名 (分数: 0.4840)\n", + "匹配2: 患者姓名 (分数: 0.4840)\n", + "匹配3: 姓名 (分数: 0.4637)\n", + 
"--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", + "匹配1: 患者性别 (分数: 0.5414)\n", + "匹配2: 性别 (分数: 0.5235)\n", + "匹配3: 性别 (分数: 0.5235)\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", + "匹配1: 患者年龄 (分数: 0.5357)\n", + "匹配2: 年龄 (分数: 0.5170)\n", + "匹配3: 年龄 (分数: 0.5170)\n", + "--------------------------------------------------\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import chardet\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import jieba\n", + "\n", + "# 首先检测文件的实际编码\n", + "def detect_encoding(file_path):\n", + " with open(file_path, 'rb') as f:\n", + " result = chardet.detect(f.read())\n", + " return result['encoding']\n", + "\n", + "# 检测文件编码\n", + "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", + "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n", + "\n", + "test_encoding = detect_encoding(test_file)\n", + "regular_encoding = detect_encoding(regular_file)\n", + "\n", + "print(f\"测试文件编码: {test_encoding}\")\n", + "print(f\"规范文件编码: {regular_encoding}\")\n", + "\n", + "# 尝试使用检测到的编码读取文件\n", + "try:\n", + " # 读取规范文件\n", + " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", + " \n", + " # 读取测试数据\n", + " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", + " \n", + " print(\"文件成功读取!\")\n", + "except Exception as e:\n", + " print(f\"使用检测到的编码读取失败: {e}\")\n", + " \n", + " # 尝试其他常见编码\n", + " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", + " \n", + " for enc in encodings:\n", + " try:\n", + " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", + " test_df = pd.read_csv(test_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取测试文件\")\n", + " \n", + " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", + " regular_df = pd.read_csv(regular_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取规范文件\")\n", + " \n", + " test_encoding = enc\n", + " regular_encoding = enc\n", + " break\n", + " except Exception as e:\n", + " print(f\"使用 {enc} 读取失败: {e}\")\n", + "\n", + "# 如果成功读取文件,继续处理\n", + "if 'test_df' in locals() and 'regular_df' in locals():\n", + " # 创建规范字典,键为注释,值为对应的规则\n", + " regular_annotations = regular_df['注释'].tolist()\n", + " \n", + " # 准备测试数据中的字段组合\n", + " combined_fields = []\n", + " \n", + " for _, row in test_df.iterrows():\n", + " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", + " combined_fields.append(combined_field)\n", + " \n", + " # 使用TF-IDF向量化文本\n", + " print(\"初始化TF-IDF向量化器...\")\n", + " \n", + " # 对中文文本进行分词处理\n", + " def tokenize_chinese(text):\n", + " return list(jieba.cut(text))\n", + " \n", + " # 初始化TF-IDF向量化器\n", + " tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n", + " \n", + " # 定义TF-IDF相似度计算方法,返回前3个最佳匹配\n", + " def calculate_top3_tfidf_similarity(query, candidates, vectorizer):\n", + " \"\"\"计算TF-IDF相似度,返回前3个最佳匹配和分数\"\"\"\n", + " try:\n", + " # 将所有文本合并为一个列表进行向量化\n", + " all_texts = [query] + candidates\n", + " \n", + " # 拟合并转换所有文本\n", + " tfidf_matrix = vectorizer.fit_transform(all_texts)\n", + " \n", + " # 计算查询与所有候选项的余弦相似度\n", + " query_vector = tfidf_matrix[0:1]\n", + " candidate_vectors = tfidf_matrix[1:]\n", + " \n", + " cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n", + " \n", + " # 找到前3个最佳匹配\n", + " top3_indices = np.argsort(cosine_scores)[::-1][:3]\n", + " top3_scores = cosine_scores[top3_indices]\n", + " \n", + 
" return top3_indices, top3_scores\n", + " except Exception as e:\n", + " print(f\"TF-IDF相似度计算失败: {e}\")\n", + " return [-1, -1, -1], [0.0, 0.0, 0.0]\n", + " \n", + " # 创建结果DataFrame\n", + " result_data = []\n", + " \n", + " print(\"开始匹配注释...\")\n", + " for i in range(len(test_df)):\n", + " if i % 100 == 0:\n", + " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n", + " \n", + " query = combined_fields[i]\n", + " \n", + " # 使用TF-IDF相似度匹配,获取前3个最佳匹配\n", + " top3_indices, top3_scores = calculate_top3_tfidf_similarity(query, regular_annotations, tfidf_vectorizer)\n", + " \n", + " # 获取测试数据的相关字段\n", + " paragraph_name = test_df.iloc[i]['ParagraphName']\n", + " statement_name = test_df.iloc[i]['StatementName']\n", + " value_item_name = test_df.iloc[i]['ValueItemName']\n", + " display_string = test_df.iloc[i]['DisplayString']\n", + " \n", + " # 获取前3个规范数据的相关字段\n", + " regular_nodes = []\n", + " regular_annotations_matched = []\n", + " regular_descriptions = []\n", + " \n", + " for idx, score in zip(top3_indices, top3_scores):\n", + " if idx >= 0:\n", + " regular_nodes.append(regular_df.iloc[idx]['节点名'])\n", + " regular_annotations_matched.append(regular_df.iloc[idx]['注释'])\n", + " regular_descriptions.append(regular_df.iloc[idx]['说明'])\n", + " else:\n", + " regular_nodes.append(\"未匹配\")\n", + " regular_annotations_matched.append(\"未匹配\")\n", + " regular_descriptions.append(\"未匹配\")\n", + " \n", + " # 确保有3个结果(如果候选项少于3个)\n", + " while len(regular_nodes) < 3:\n", + " regular_nodes.append(\"未匹配\")\n", + " regular_annotations_matched.append(\"未匹配\")\n", + " regular_descriptions.append(\"未匹配\")\n", + " top3_scores = np.append(top3_scores, 0.0)\n", + " \n", + " # 添加到结果数据\n", + " result_data.append({\n", + " 'ParagraphName': paragraph_name,\n", + " 'StatementName': statement_name,\n", + " 'ValueItemName': value_item_name,\n", + " 'DisplayString': display_string,\n", + " '规范节点名1': regular_nodes[0],\n", + " '规范注释1': regular_annotations_matched[0],\n", + " '规范说明1': regular_descriptions[0],\n", + " '相似度分数1': top3_scores[0],\n", + " '规范节点名2': regular_nodes[1],\n", + " '规范注释2': regular_annotations_matched[1],\n", + " '规范说明2': regular_descriptions[1],\n", + " '相似度分数2': top3_scores[1],\n", + " '规范节点名3': regular_nodes[2],\n", + " '规范注释3': regular_annotations_matched[2],\n", + " '规范说明3': regular_descriptions[2],\n", + " '相似度分数3': top3_scores[2]\n", + " })\n", + " \n", + " # 创建结果DataFrame\n", + " result_df = pd.DataFrame(result_data)\n", + " \n", + " # 保存结果\n", + " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv', \n", + " index=False, encoding=test_encoding)\n", + " \n", + " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n", + " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\")\n", + " \n", + " # 打印前3行匹配结果示例\n", + " print(\"\\n前3行匹配结果示例:\")\n", + " for i in range(min(3, len(result_df))):\n", + " print(f\"原始字段: {combined_fields[i]}\")\n", + " print(f\"匹配1: {result_df.iloc[i]['规范注释1']} (分数: {result_df.iloc[i]['相似度分数1']:.4f})\")\n", + " print(f\"匹配2: {result_df.iloc[i]['规范注释2']} (分数: {result_df.iloc[i]['相似度分数2']:.4f})\")\n", + " print(f\"匹配3: {result_df.iloc[i]['规范注释3']} (分数: {result_df.iloc[i]['相似度分数3']:.4f})\")\n", + " print(\"-\" * 50)\n", + "else:\n", + " print(\"无法读取文件,请手动检查文件编码\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "测试文件编码: GB2312\n", + "规范文件编码: utf-8\n", + "文件成功读取!\n", + "初始化TF-IDF向量化器...\n", + "开始匹配注释...\n", + "处理第 0/127 条记录...\n" 
+ ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "处理第 100/127 条记录...\n", + "\n", + "匹配完成,共处理 127 条记录\n", + "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_fuzzy_match.csv\n", + "\n", + "前3行匹配结果示例:\n", + "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", + "TF-IDF匹配结果:\n", + " 匹配1: 患者姓名 (分数: 0.4840)\n", + " 匹配2: 患者姓名 (分数: 0.4840)\n", + " 匹配3: 姓名 (分数: 0.4637)\n", + "FuzzyWuzzy匹配结果:\n", + " 匹配1: 患者姓名 (分数: 0.3300)\n", + " 匹配2: 入院时情况-患者姓名 (分数: 0.3300)\n", + " 匹配3: 患者姓名 (分数: 0.3300)\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", + "TF-IDF匹配结果:\n", + " 匹配1: 患者性别 (分数: 0.5414)\n", + " 匹配2: 性别 (分数: 0.5235)\n", + " 匹配3: 性别 (分数: 0.5235)\n", + "FuzzyWuzzy匹配结果:\n", + " 匹配1: 患者姓名 (分数: 0.3600)\n", + " 匹配2: 入院时情况-患者姓名 (分数: 0.3600)\n", + " 匹配3: 患者姓名 (分数: 0.3600)\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", + "TF-IDF匹配结果:\n", + " 匹配1: 患者年龄 (分数: 0.5357)\n", + " 匹配2: 年龄 (分数: 0.5170)\n", + " 匹配3: 年龄 (分数: 0.5170)\n", + "FuzzyWuzzy匹配结果:\n", + " 匹配1: 患者姓名 (分数: 0.3600)\n", + " 匹配2: 入院时情况-患者姓名 (分数: 0.3600)\n", + " 匹配3: 患者姓名 (分数: 0.3600)\n", + "--------------------------------------------------\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import chardet\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import jieba\n", + "from fuzzywuzzy import fuzz\n", + "from fuzzywuzzy import process\n", + "\n", + "# 首先检测文件的实际编码\n", + "def detect_encoding(file_path):\n", + " with open(file_path, 'rb') as f:\n", + " result = chardet.detect(f.read())\n", + " return result['encoding']\n", + "\n", + "# 检测文件编码\n", + "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", + "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n", + "\n", + "test_encoding = detect_encoding(test_file)\n", + "regular_encoding = detect_encoding(regular_file)\n", + "\n", + "print(f\"测试文件编码: {test_encoding}\")\n", + "print(f\"规范文件编码: {regular_encoding}\")\n", + "\n", + "# 尝试使用检测到的编码读取文件\n", + "try:\n", + " # 读取规范文件\n", + " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", + " \n", + " # 读取测试数据\n", + " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", + " \n", + " print(\"文件成功读取!\")\n", + "except Exception as e:\n", + " print(f\"使用检测到的编码读取失败: {e}\")\n", + " \n", + " # 尝试其他常见编码\n", + " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", + " \n", + " for enc in encodings:\n", + " try:\n", + " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", + " test_df = pd.read_csv(test_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取测试文件\")\n", + " \n", + " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", + " regular_df = pd.read_csv(regular_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取规范文件\")\n", + " \n", + " test_encoding = enc\n", + " regular_encoding = enc\n", + " break\n", + " except Exception as e:\n", + " print(f\"使用 {enc} 读取失败: {e}\")\n", + "\n", + "# 如果成功读取文件,继续处理\n", + "if 'test_df' in locals() and 'regular_df' in locals():\n", + " # 创建规范字典,键为注释,值为对应的规则\n", + " regular_annotations = regular_df['注释'].tolist()\n", + " \n", + " # 准备测试数据中的字段组合\n", + " combined_fields = []\n", + " \n", + " for _, 
row in test_df.iterrows():\n", + " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", + " combined_fields.append(combined_field)\n", + " \n", + " # 使用TF-IDF向量化文本\n", + " print(\"初始化TF-IDF向量化器...\")\n", + " \n", + " # 对中文文本进行分词处理\n", + " def tokenize_chinese(text):\n", + " return list(jieba.cut(text))\n", + " \n", + " # 初始化TF-IDF向量化器\n", + " tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n", + " \n", + " # 定义TF-IDF相似度计算方法,返回最佳匹配(最多3个)\n", + " def calculate_tfidf_similarity(query, candidates, vectorizer, max_matches=3, threshold=0.1):\n", + " \"\"\"计算TF-IDF相似度,返回最佳匹配(最多max_matches个)\"\"\"\n", + " try:\n", + " # 将所有文本合并为一个列表进行向量化\n", + " all_texts = [query] + candidates\n", + " \n", + " # 拟合并转换所有文本\n", + " tfidf_matrix = vectorizer.fit_transform(all_texts)\n", + " \n", + " # 计算查询与所有候选项的余弦相似度\n", + " query_vector = tfidf_matrix[0:1]\n", + " candidate_vectors = tfidf_matrix[1:]\n", + " \n", + " cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n", + " \n", + " # 找到相似度大于阈值的匹配\n", + " valid_indices = np.where(cosine_scores > threshold)[0]\n", + " \n", + " # 按相似度降序排序\n", + " sorted_indices = valid_indices[np.argsort(cosine_scores[valid_indices])[::-1]]\n", + " \n", + " # 最多取max_matches个\n", + " top_indices = sorted_indices[:max_matches]\n", + " top_scores = cosine_scores[top_indices]\n", + " \n", + " return top_indices, top_scores\n", + " except Exception as e:\n", + " print(f\"TF-IDF相似度计算失败: {e}\")\n", + " return np.array([]), np.array([])\n", + " \n", + " # 定义FuzzyWuzzy相似度计算方法,返回最佳匹配(最多3个)\n", + " def calculate_fuzzy_similarity(query, candidates, max_matches=3):\n", + " \"\"\"计算FuzzyWuzzy相似度,返回最佳匹配(最多max_matches个)\"\"\"\n", + " try:\n", + " # 使用process.extract获取最佳匹配\n", + " matches = process.extract(query, candidates, limit=max_matches, scorer=fuzz.token_sort_ratio)\n", + " \n", + " # 提取索引和分数\n", + " indices = []\n", + " scores = []\n", + " \n", + " for match in matches:\n", + " # match格式为(匹配文本, 分数)\n", + " matched_text, score = match\n", + " # 找到匹配文本在原始列表中的索引\n", + " idx = candidates.index(matched_text)\n", + " indices.append(idx)\n", + " scores.append(score / 100.0) # 将分数归一化到0-1范围\n", + " \n", + " return np.array(indices), np.array(scores)\n", + " except Exception as e:\n", + " print(f\"FuzzyWuzzy相似度计算失败: {e}\")\n", + " return np.array([]), np.array([])\n", + " \n", + " # 创建结果DataFrame\n", + " result_data = []\n", + " \n", + " print(\"开始匹配注释...\")\n", + " \n", + " for i in range(len(test_df)):\n", + " if i % 100 == 0:\n", + " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n", + " \n", + " query = combined_fields[i]\n", + " \n", + " # 使用TF-IDF相似度匹配,获取最佳匹配(最多3个)\n", + " tfidf_indices, tfidf_scores = calculate_tfidf_similarity(query, regular_annotations, tfidf_vectorizer)\n", + " \n", + " # 使用FuzzyWuzzy相似度匹配,获取最佳匹配(最多3个)\n", + " fuzzy_indices, fuzzy_scores = calculate_fuzzy_similarity(query, regular_annotations)\n", + " \n", + " # 获取测试数据的相关字段\n", + " paragraph_name = test_df.iloc[i]['ParagraphName']\n", + " statement_name = test_df.iloc[i]['StatementName']\n", + " value_item_name = test_df.iloc[i]['ValueItemName']\n", + " display_string = test_df.iloc[i]['DisplayString']\n", + " \n", + " # 创建结果字典\n", + " result_dict = {\n", + " 'ParagraphName': paragraph_name,\n", + " 'StatementName': statement_name,\n", + " 'ValueItemName': value_item_name,\n", + " 'DisplayString': display_string\n", + " }\n", + " \n", + " # 添加TF-IDF匹配结果\n", + " for j in range(min(3, len(tfidf_indices))):\n", + " idx = 
tfidf_indices[j]\n", + " score = tfidf_scores[j]\n", + " result_dict[f'TFIDF_规范节点名{j+1}'] = regular_df.iloc[idx]['节点名']\n", + " result_dict[f'TFIDF_规范注释{j+1}'] = regular_df.iloc[idx]['注释']\n", + " result_dict[f'TFIDF_规范说明{j+1}'] = regular_df.iloc[idx]['说明']\n", + " result_dict[f'TFIDF_相似度分数{j+1}'] = score\n", + " \n", + " # 添加FuzzyWuzzy匹配结果\n", + " for j in range(min(3, len(fuzzy_indices))):\n", + " idx = fuzzy_indices[j]\n", + " score = fuzzy_scores[j]\n", + " result_dict[f'Fuzzy_规范节点名{j+1}'] = regular_df.iloc[idx]['节点名']\n", + " result_dict[f'Fuzzy_规范注释{j+1}'] = regular_df.iloc[idx]['注释']\n", + " result_dict[f'Fuzzy_规范说明{j+1}'] = regular_df.iloc[idx]['说明']\n", + " result_dict[f'Fuzzy_相似度分数{j+1}'] = score\n", + " \n", + " # 添加到结果数据\n", + " result_data.append(result_dict)\n", + " \n", + " # 创建结果DataFrame\n", + " result_df = pd.DataFrame(result_data)\n", + " \n", + " # 保存结果\n", + " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_fuzzy_match.csv', \n", + " index=False, encoding=test_encoding)\n", + " \n", + " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n", + " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_fuzzy_match.csv\")\n", + " \n", + " # 打印前3行匹配结果示例\n", + " print(\"\\n前3行匹配结果示例:\")\n", + " for i in range(min(3, len(result_df))):\n", + " print(f\"原始字段: {combined_fields[i]}\")\n", + " \n", + " print(\"TF-IDF匹配结果:\")\n", + " for j in range(1, 4):\n", + " if f'TFIDF_规范注释{j}' in result_df.columns and not pd.isna(result_df.iloc[i].get(f'TFIDF_规范注释{j}', None)):\n", + " print(f\" 匹配{j}: {result_df.iloc[i][f'TFIDF_规范注释{j}']} (分数: {result_df.iloc[i][f'TFIDF_相似度分数{j}']:.4f})\")\n", + " \n", + " print(\"FuzzyWuzzy匹配结果:\")\n", + " for j in range(1, 4):\n", + " if f'Fuzzy_规范注释{j}' in result_df.columns and not pd.isna(result_df.iloc[i].get(f'Fuzzy_规范注释{j}', None)):\n", + " print(f\" 匹配{j}: {result_df.iloc[i][f'Fuzzy_规范注释{j}']} (分数: {result_df.iloc[i][f'Fuzzy_相似度分数{j}']:.4f})\")\n", + " \n", + " print(\"-\" * 50)\n", + "else:\n", + " print(\"无法读取文件,请手动检查文件编码\")" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n", + " warnings.warn(\n", + "Building prefix dict from the default dictionary ...\n", + "Loading model from cache /tmp/jieba.cache\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "测试文件编码: GB2312\n", + "规范文件编码: utf-8\n", + "文件成功读取!\n", + "初始化TF-IDF向量化器...\n", + "开始匹配注释...\n", + "处理第 0/127 条记录...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading model cost 0.942 seconds.\n", + "Prefix dict has been built successfully.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "处理第 100/127 条记录...\n", + "\n", + "匹配完成,共处理 127 条记录\n", + "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv\n", + "\n", + "前3行匹配结果示例:\n", + "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", + "最佳匹配: 患者姓名 (分数: 0.4840)\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", + "最佳匹配: 患者性别 (分数: 0.5414)\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", + "最佳匹配: 患者年龄 (分数: 0.5357)\n", + "--------------------------------------------------\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + 
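"# NOTE (editor): detect_encoding below feeds the entire file to chardet.detect;\n",
+ "# for large files a bounded sample is usually enough, e.g.\n",
+ "# chardet.detect(f.read(100000)) -- an assumption about file size, not a\n",
+ "# requirement of chardet itself.\n",
+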
"import chardet\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import jieba\n", + "\n", + "# 首先检测文件的实际编码\n", + "def detect_encoding(file_path):\n", + " with open(file_path, 'rb') as f:\n", + " result = chardet.detect(f.read())\n", + " return result['encoding']\n", + "\n", + "# 检测文件编码\n", + "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", + "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n", + "\n", + "test_encoding = detect_encoding(test_file)\n", + "regular_encoding = detect_encoding(regular_file)\n", + "\n", + "print(f\"测试文件编码: {test_encoding}\")\n", + "print(f\"规范文件编码: {regular_encoding}\")\n", + "\n", + "# 尝试使用检测到的编码读取文件\n", + "try:\n", + " # 读取规范文件\n", + " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", + " \n", + " # 读取测试数据\n", + " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", + " \n", + " print(\"文件成功读取!\")\n", + "except Exception as e:\n", + " print(f\"使用检测到的编码读取失败: {e}\")\n", + " \n", + " # 尝试其他常见编码\n", + " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", + " \n", + " for enc in encodings:\n", + " try:\n", + " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", + " test_df = pd.read_csv(test_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取测试文件\")\n", + " \n", + " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", + " regular_df = pd.read_csv(regular_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取规范文件\")\n", + " \n", + " test_encoding = enc\n", + " regular_encoding = enc\n", + " break\n", + " except Exception as e:\n", + " print(f\"使用 {enc} 读取失败: {e}\")\n", + "\n", + "# 如果成功读取文件,继续处理\n", + "if 'test_df' in locals() and 'regular_df' in locals():\n", + " # 创建规范字典,键为注释,值为对应的规则\n", + " regular_annotations = regular_df['注释'].tolist()\n", + " \n", + " # 准备测试数据中的字段组合\n", + " combined_fields = []\n", + " \n", + " for _, row in test_df.iterrows():\n", + " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", + " combined_fields.append(combined_field)\n", + " \n", + " # 使用TF-IDF向量化文本\n", + " print(\"初始化TF-IDF向量化器...\")\n", + " \n", + " # 对中文文本进行分词处理\n", + " def tokenize_chinese(text):\n", + " return list(jieba.cut(text))\n", + " \n", + " # 初始化TF-IDF向量化器\n", + " tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n", + " \n", + " # 定义TF-IDF相似度计算方法,只返回最佳匹配(分数最高的)\n", + " def calculate_best_tfidf_match(query, candidates, vectorizer):\n", + " \"\"\"计算TF-IDF相似度,只返回最佳匹配(分数最高的)\"\"\"\n", + " try:\n", + " # 将所有文本合并为一个列表进行向量化\n", + " all_texts = [query] + candidates\n", + " \n", + " # 拟合并转换所有文本\n", + " tfidf_matrix = vectorizer.fit_transform(all_texts)\n", + " \n", + " # 计算查询与所有候选项的余弦相似度\n", + " query_vector = tfidf_matrix[0:1]\n", + " candidate_vectors = tfidf_matrix[1:]\n", + " \n", + " cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n", + " \n", + " # 找到分数最高的匹配\n", + " best_index = np.argmax(cosine_scores)\n", + " best_score = cosine_scores[best_index]\n", + " \n", + " return best_index, best_score\n", + " except Exception as e:\n", + " print(f\"TF-IDF相似度计算失败: {e}\")\n", + " return -1, 0.0\n", + " \n", + " # 创建结果DataFrame\n", + " result_data = []\n", + " \n", + " print(\"开始匹配注释...\")\n", + " \n", + " for i in range(len(test_df)):\n", + " if i % 100 == 0:\n", + " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n", + " \n", + " query = combined_fields[i]\n", + " \n", + " # 使用TF-IDF相似度匹配,只获取最佳匹配\n", + " best_index, best_score = calculate_best_tfidf_match(query, 
regular_annotations, tfidf_vectorizer)\n", + " \n", + " # 获取测试数据的相关字段\n", + " row = test_df.iloc[i]\n", + " \n", + " # 创建结果字典,包含原始字段\n", + " result_dict = {\n", + " 'ParagraphName': row['ParagraphName'],\n", + " 'StatementName': row['StatementName'],\n", + " 'ValueItemName': row['ValueItemName'],\n", + " 'DisplayString': row['DisplayString']\n", + " }\n", + " \n", + " # 添加SFZH, XGRQ, IPBLH字段(如果存在)\n", + " if 'SFZH' in test_df.columns:\n", + " result_dict['SFZH'] = row['SFZH']\n", + " if 'XGRQ' in test_df.columns:\n", + " result_dict['XGRQ'] = row['XGRQ']\n", + " if 'IPBLH' in test_df.columns:\n", + " result_dict['IPBLH'] = row['IPBLH']\n", + " \n", + " # 添加最佳TF-IDF匹配结果\n", + " if best_index >= 0:\n", + " result_dict['TFIDF_规范节点名'] = regular_df.iloc[best_index]['节点名']\n", + " result_dict['TFIDF_规范注释'] = regular_df.iloc[best_index]['注释']\n", + " result_dict['TFIDF_规范说明'] = regular_df.iloc[best_index]['说明']\n", + " result_dict['TFIDF_相似度分数'] = best_score\n", + " else:\n", + " result_dict['TFIDF_规范节点名'] = ''\n", + " result_dict['TFIDF_规范注释'] = ''\n", + " result_dict['TFIDF_规范说明'] = ''\n", + " result_dict['TFIDF_相似度分数'] = 0.0\n", + " \n", + " # 添加到结果数据\n", + " result_data.append(result_dict)\n", + " \n", + " # 创建结果DataFrame\n", + " result_df = pd.DataFrame(result_data)\n", + " \n", + " # 保存结果\n", + " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv', \n", + " index=False, encoding=test_encoding)\n", + " \n", + " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n", + " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv\")\n", + " \n", + " # 打印前3行匹配结果示例\n", + " print(\"\\n前3行匹配结果示例:\")\n", + " for i in range(min(3, len(result_df))):\n", + " print(f\"原始字段: {combined_fields[i]}\")\n", + " print(f\"最佳匹配: {result_df.iloc[i]['TFIDF_规范注释']} (分数: {result_df.iloc[i]['TFIDF_相似度分数']:.4f})\")\n", + " print(\"-\" * 50)\n", + "else:\n", + " print(\"无法读取文件,请手动检查文件编码\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "2025-03-10 09:55:11,393\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 03-10 09:55:19 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.\n", + "WARNING 03-10 09:55:19 config.py:428] gptq quantization is not fully optimized yet. 
The speed can be slower than non-quantized models.\n", + "INFO 03-10 09:55:19 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', speculative_config=None, tokenizer='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, chat_template_text_format=string, mm_processor_kwargs=None, pooler_config=None)\n", + "INFO 03-10 09:55:20 selector.py:261] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.\n", + "INFO 03-10 09:55:20 selector.py:144] Using XFormers backend.\n", + "INFO 03-10 09:55:20 model_runner.py:1072] Starting to load model /opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[W310 09:55:20.487166102 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())\n", + "Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00= 0:\n", + " result_dict['TFIDF_规范节点名'] = regular_df.iloc[best_index]['节点名']\n", + " result_dict['TFIDF_规范注释'] = regular_df.iloc[best_index]['注释']\n", + " result_dict['TFIDF_规范说明'] = regular_df.iloc[best_index]['说明']\n", + " result_dict['TFIDF_相似度分数'] = best_score\n", + " else:\n", + " result_dict['TFIDF_规范节点名'] = ''\n", + " result_dict['TFIDF_规范注释'] = ''\n", + " result_dict['TFIDF_规范说明'] = ''\n", + " result_dict['TFIDF_相似度分数'] = 0.0\n", + " \n", + " # 添加到结果数据\n", + " result_data.append(result_dict)\n", + " \n", + " # 创建结果DataFrame\n", + " result_df = pd.DataFrame(result_data)\n", + " \n", + " # 保存结果\n", + " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv', \n", + " index=False, encoding=test_encoding)\n", + " \n", + " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n", + " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv\")\n", + " \n", + " # 打印前3行匹配结果示例\n", + " print(\"\\n前3行匹配结果示例:\")\n", + " for i in range(min(3, len(result_df))):\n", + " print(f\"原始字段: {combined_fields[i]}\")\n", + " print(f\"最佳匹配: {result_df.iloc[i]['TFIDF_规范注释']} (分数: {result_df.iloc[i]['TFIDF_相似度分数']:.4f})\")\n", + " print(\"-\" * 50)\n", + "else:\n", + " print(\"无法读取文件,请手动检查文件编码\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "测试文件编码: GB2312\n", + "规范文件编码: utf-8\n", + "文件成功读取!\n", + "\n", + "测试文件的列名:\n", + "['Id', 'PatientName', 'IPBLH', 'OPBLH', 'KH', 'KLX', 'SexId', 'CSRQ', 'ZJLX', 'SFZH', 'HYZK', 'ZYDM', 'GJDM', 'MZDM', 'JZDZ', 'YB', 'JG', 'HKDZ', 'DHHM', 'SJHM', 'ABOBloodTypeId', 'LXRXM', 'LXRGX', 'LXRDH', 'ZLLB', 
'ZLMC', 'XGRQ', 'YJLXH', 'RYSJ', 'FolderName', 'Xh', 'RecordXh', 'FolderId', 'DocumentName', 'InstanceId', 'DocumentId', 'ParagraphId', 'ParagraphName', 'StatementId', 'StatementName', 'ValueId', 'ValueItemName', 'ValueItemKind', 'RealValue', 'ValueString', 'DisplayString', 'ValuePostfix', 'WSJLSCSJ', 'WSJLXGSJ', 'upload_time']\n", + "\n", + "初始化TF-IDF向量化器...\n", + "开始匹配注释...\n", + "处理第 0/127 条记录...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "处理第 100/127 条记录...\n", + "\n", + "匹配完成,共处理 127 条记录\n", + "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_完整字段_tfidf_match.csv\n", + "\n", + "结果文件的列名:\n", + "['Id', 'PatientName', 'IPBLH', 'OPBLH', 'KH', 'KLX', 'SexId', 'CSRQ', 'ZJLX', 'SFZH', 'HYZK', 'ZYDM', 'GJDM', 'MZDM', 'JZDZ', 'YB', 'JG', 'HKDZ', 'DHHM', 'SJHM', 'ABOBloodTypeId', 'LXRXM', 'LXRGX', 'LXRDH', 'ZLLB', 'ZLMC', 'XGRQ', 'YJLXH', 'RYSJ', 'FolderName', 'Xh', 'RecordXh', 'FolderId', 'DocumentName', 'InstanceId', 'DocumentId', 'ParagraphId', 'ParagraphName', 'StatementId', 'StatementName', 'ValueId', 'ValueItemName', 'ValueItemKind', 'RealValue', 'ValueString', 'DisplayString', 'ValuePostfix', 'WSJLSCSJ', 'WSJLXGSJ', 'upload_time', '规范节点名', '规范注释', '规范说明', 'processed_string']\n", + "\n", + "前3行匹配结果示例:\n", + "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", + "最佳匹配: 患者姓名\n", + "处理后字符串: 测试\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", + "最佳匹配: 患者性别\n", + "处理后字符串: 女\n", + "--------------------------------------------------\n", + "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", + "最佳匹配: 患者年龄\n", + "处理后字符串: 22岁\n", + "--------------------------------------------------\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import chardet\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import jieba\n", + "\n", + "# 首先检测文件的实际编码\n", + "def detect_encoding(file_path):\n", + " with open(file_path, 'rb') as f:\n", + " result = chardet.detect(f.read())\n", + " return result['encoding']\n", + "\n", + "# 检测文件编码\n", + "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", + "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n", + "\n", + "test_encoding = detect_encoding(test_file)\n", + "regular_encoding = detect_encoding(regular_file)\n", + "\n", + "print(f\"测试文件编码: {test_encoding}\")\n", + "print(f\"规范文件编码: {regular_encoding}\")\n", + "\n", + "# 尝试使用检测到的编码读取文件\n", + "try:\n", + " # 读取规范文件\n", + " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", + " \n", + " # 读取测试数据\n", + " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", + " \n", + " print(\"文件成功读取!\")\n", + "except Exception as e:\n", + " print(f\"使用检测到的编码读取失败: {e}\")\n", + " \n", + " # 尝试其他常见编码\n", + " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", + " \n", + " for enc in encodings:\n", + " try:\n", + " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", + " test_df = pd.read_csv(test_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取测试文件\")\n", + " \n", + " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", + " regular_df = pd.read_csv(regular_file, encoding=enc)\n", + " print(f\"成功使用 {enc} 读取规范文件\")\n", + " \n", + " test_encoding = enc\n", + " 
regular_encoding = enc\n", + " break\n", + " except Exception as e:\n", + " print(f\"使用 {enc} 读取失败: {e}\")\n", + "\n", + "# 如果成功读取文件,继续处理\n", + "if 'test_df' in locals() and 'regular_df' in locals():\n", + " # 打印测试文件的列名,以供参考\n", + " print(\"\\n测试文件的列名:\")\n", + " print(test_df.columns.tolist())\n", + " \n", + " # 创建规范字典,键为注释,值为对应的规则\n", + " regular_annotations = regular_df['注释'].tolist()\n", + " \n", + " # 准备测试数据中的字段组合\n", + " combined_fields = []\n", + " \n", + " for _, row in test_df.iterrows():\n", + " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", + " combined_fields.append(combined_field)\n", + " \n", + " # 使用TF-IDF向量化文本\n", + " print(\"\\n初始化TF-IDF向量化器...\")\n", + " \n", + " # 对中文文本进行分词处理\n", + " def tokenize_chinese(text):\n", + " return list(jieba.cut(text))\n", + " \n", + " # 初始化TF-IDF向量化器\n", + " tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n", + " \n", + " # 定义TF-IDF相似度计算方法,只返回最佳匹配(分数最高的)\n", + " def calculate_best_tfidf_match(query, candidates, vectorizer):\n", + " \"\"\"计算TF-IDF相似度,只返回最佳匹配(分数最高的)\"\"\"\n", + " try:\n", + " # 将所有文本合并为一个列表进行向量化\n", + " all_texts = [query] + candidates\n", + " \n", + " # 拟合并转换所有文本\n", + " tfidf_matrix = vectorizer.fit_transform(all_texts)\n", + " \n", + " # 计算查询与所有候选项的余弦相似度\n", + " query_vector = tfidf_matrix[0:1]\n", + " candidate_vectors = tfidf_matrix[1:]\n", + " \n", + " cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n", + " \n", + " # 找到分数最高的匹配\n", + " best_index = np.argmax(cosine_scores)\n", + " best_score = cosine_scores[best_index]\n", + " \n", + " return best_index, best_score\n", + " except Exception as e:\n", + " print(f\"TF-IDF相似度计算失败: {e}\")\n", + " return -1, 0.0\n", + " \n", + " # 创建结果DataFrame\n", + " result_data = []\n", + " \n", + " print(\"开始匹配注释...\")\n", + " \n", + " for i in range(len(test_df)):\n", + " if i % 100 == 0:\n", + " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n", + " \n", + " query = combined_fields[i]\n", + " \n", + " # 使用TF-IDF相似度匹配,只获取最佳匹配\n", + " best_index, best_score = calculate_best_tfidf_match(query, regular_annotations, tfidf_vectorizer)\n", + " \n", + " # 获取测试数据的行\n", + " row = test_df.iloc[i]\n", + " \n", + " # 创建结果字典,包含测试数据的所有字段\n", + " result_dict = row.to_dict()\n", + " \n", + " # 添加最佳TF-IDF匹配结果\n", + " if best_index >= 0:\n", + " result_dict['规范节点名'] = regular_df.iloc[best_index]['节点名']\n", + " result_dict['规范注释'] = regular_df.iloc[best_index]['注释']\n", + " result_dict['规范说明'] = regular_df.iloc[best_index]['说明']\n", + " \n", + " \n", + " # 从最佳匹配中提取processed_string值\n", + " # 如果需要对DisplayString进行处理,可以在此添加逻辑\n", + " result_dict['processed_string'] = row['DisplayString']\n", + " else:\n", + " result_dict['规范节点名'] = ''\n", + " result_dict['规范注释'] = ''\n", + " result_dict['规范说明'] = ''\n", + " \n", + " result_dict['processed_string'] = ''\n", + " \n", + " # 添加到结果数据\n", + " result_data.append(result_dict)\n", + " \n", + " # 创建结果DataFrame\n", + " result_df = pd.DataFrame(result_data)\n", + " \n", + " # 重新排列列顺序,将匹配结果放在后面\n", + " all_columns = test_df.columns.tolist() + ['规范节点名', '规范注释', '规范说明', 'processed_string']\n", + " result_df = result_df[all_columns]\n", + " \n", + " # 保存结果\n", + " result_file = '/home/limeng/SICT/lung_test/result/喉癌患者测试样例_完整字段_tfidf_match.csv'\n", + " result_df.to_csv(result_file, index=False, encoding=test_encoding)\n", + " \n", + " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n", + " print(f\"结果已保存至: {result_file}\")\n", + " \n", + " # 打印结果DataFrame的列名\n", + " 
print(\"\\n结果文件的列名:\")\n", + " print(result_df.columns.tolist())\n", + " \n", + " # 打印前3行匹配结果示例\n", + " print(\"\\n前3行匹配结果示例:\")\n", + " for i in range(min(3, len(result_df))):\n", + " print(f\"原始字段: {combined_fields[i]}\")\n", + " print(f\"最佳匹配: {result_df.iloc[i]['规范注释']}\")\n", + " print(f\"处理后字符串: {result_df.iloc[i]['processed_string']}\")\n", + " print(\"-\" * 50)\n", + "else:\n", + " print(\"无法读取文件,请手动检查文件编码\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "2025-03-10 09:55:11,393\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO 03-10 09:55:19 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.\n", + "WARNING 03-10 09:55:19 config.py:428] gptq quantization is not fully optimized yet. The speed can be slower than non-quantized models.\n", + "INFO 03-10 09:55:19 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', speculative_config=None, tokenizer='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, chat_template_text_format=string, mm_processor_kwargs=None, pooler_config=None)\n", + "INFO 03-10 09:55:20 selector.py:261] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.\n", + "INFO 03-10 09:55:20 selector.py:144] Using XFormers backend.\n", + "INFO 03-10 09:55:20 model_runner.py:1072] Starting to load model /opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[W310 09:55:20.487166102 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())\n", + "Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00