{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
节点名表名节点类型宽度是否必传注释说明
0COC_HBZL_RYJL$kh入院记录字符32Y卡号患者就诊卡卡号
1COC_HBZL_RYJL$klx入院记录字符16Y卡类型参见字典表
2COC_HBZL_RYJL$xgbz入院记录字符1Y修改标志1:正常 2:修改3:撤销
3COC_HBZL_RYJL$yjlxh入院记录字符32Y原纪录序号院内唯一标识
4COC_HBZL_RYJL$hzbh入院记录varchar64Y患者编号NaN
........................
1137COC_HBZL_SFJL$hgnpg随访记录varchar500NaN喉功能评估NaN
1138COC_HBZL_SFJL$zhzlsj随访记录varchar20NaN综合治疗时间【天/月/年】后
1139COC_HBZL_SFJL$zhzlfa随访记录varchar500NaN综合治疗方案NaN
1140COC_HBZL_SFJL$sxz随访记录varchar50NaN书写者NaN
1141COC_HBZL_SFJL$cjsj随访记录varchar20NaN创建时间NaN
\n", "

1142 rows × 7 columns

\n", "
" ], "text/plain": [ " 节点名 表名 节点类型 宽度 是否必传 注释 说明\n", "0 COC_HBZL_RYJL$kh 入院记录 字符 32 Y 卡号 患者就诊卡卡号\n", "1 COC_HBZL_RYJL$klx 入院记录 字符 16 Y 卡类型 参见字典表\n", "2 COC_HBZL_RYJL$xgbz 入院记录 字符 1 Y 修改标志 1:正常 2:修改3:撤销\n", "3 COC_HBZL_RYJL$yjlxh 入院记录 字符 32 Y 原纪录序号 院内唯一标识\n", "4 COC_HBZL_RYJL$hzbh 入院记录 varchar 64 Y 患者编号 NaN\n", "... ... ... ... ... ... ... ...\n", "1137 COC_HBZL_SFJL$hgnpg 随访记录 varchar 500 NaN 喉功能评估 NaN\n", "1138 COC_HBZL_SFJL$zhzlsj 随访记录 varchar 20 NaN 综合治疗时间 【天/月/年】后\n", "1139 COC_HBZL_SFJL$zhzlfa 随访记录 varchar 500 NaN 综合治疗方案 NaN\n", "1140 COC_HBZL_SFJL$sxz 随访记录 varchar 50 NaN 书写者 NaN\n", "1141 COC_HBZL_SFJL$cjsj 随访记录 varchar 20 NaN 创建时间 NaN\n", "\n", "[1142 rows x 7 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_excel('/home/limeng/SICT/lung_test/数据采集接口规范(喉癌).xlsx')\n", "df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df.to_csv('regular.csv',index = False)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "匹配结果统计:\n", "未匹配 127\n", "Name: count, dtype: int64\n" ] } ], "source": [ "import pandas as pd\n", "import difflib\n", "\n", "def get_best_match(target, choices):\n", " \"\"\"使用difflib找到最佳匹配\"\"\"\n", " matches = difflib.get_close_matches(target, choices, n=1, cutoff=0.6)\n", " return matches[0] if matches else None\n", "\n", "# 读取规范文件\n", "regular_df = pd.read_csv('/home/limeng/SICT/lung_test/regular.csv')\n", "\n", "# 读取测试数据\n", "test_df = pd.read_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例.csv', encoding='ISO-8859-1')\n", "\n", "# 创建规范字典,键为注释,值为对应的规则\n", "regular_dict = dict(zip(regular_df['注释'], regular_df.to_dict('records')))\n", "\n", "# 创建新的注释列\n", "matched_annotations = []\n", "for _, row in test_df.iterrows():\n", " # 组合三个字段\n", " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", " \n", " # 在规范中查找最佳匹配\n", " best_match = get_best_match(combined_field, regular_dict.keys())\n", " matched_annotations.append(best_match if best_match else \"未匹配\")\n", "\n", "# 获取ValueItemKind列的位置\n", "kind_idx = test_df.columns.get_loc('ValueItemKind')\n", "\n", "# 在ValueItemKind列前插入新的注释列\n", "test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n", "\n", "# 保存结果\n", "test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations.csv', index=False)\n", "\n", "# 打印匹配结果统计\n", "print(\"\\n匹配结果统计:\")\n", "print(pd.Series(matched_annotations).value_counts())" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "测试文件编码: GB2312\n", "规范文件编码: utf-8\n", "文件成功读取!\n", "\n", "匹配结果统计:\n", "未匹配 123\n", "现病史-精神状态 1\n", "体格检查-精神状态 1\n", "体格检查-呼吸 1\n", "体格检查-查体 1\n", "Name: count, dtype: int64\n" ] } ], "source": [ "import pandas as pd\n", "import difflib\n", "import chardet\n", "\n", "# 首先检测文件的实际编码\n", "def detect_encoding(file_path):\n", " with open(file_path, 'rb') as f:\n", " result = chardet.detect(f.read())\n", " return result['encoding']\n", "\n", "# 检测文件编码\n", "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n", "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n", "\n", "test_encoding = detect_encoding(test_file)\n", "regular_encoding = detect_encoding(regular_file)\n", "\n", "print(f\"测试文件编码: {test_encoding}\")\n", "print(f\"规范文件编码: {regular_encoding}\")\n", "\n", "# 尝试使用检测到的编码读取文件\n", "try:\n", " # 读取规范文件\n", " regular_df = 
pd.read_csv(regular_file, encoding=regular_encoding)\n", " \n", " # 读取测试数据\n", " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", " \n", " print(\"文件成功读取!\")\n", "except Exception as e:\n", " print(f\"使用检测到的编码读取失败: {e}\")\n", " \n", " # 尝试其他常见编码\n", " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", " \n", " for enc in encodings:\n", " try:\n", " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", " test_df = pd.read_csv(test_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取测试文件\")\n", " \n", " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", " regular_df = pd.read_csv(regular_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取规范文件\")\n", " \n", " test_encoding = enc\n", " regular_encoding = enc\n", " break\n", " except Exception as e:\n", " print(f\"使用 {enc} 读取失败: {e}\")\n", "\n", "# 如果成功读取文件,继续处理\n", "if 'test_df' in locals() and 'regular_df' in locals():\n", " def get_best_match(target, choices):\n", " \"\"\"使用difflib找到最佳匹配\"\"\"\n", " matches = difflib.get_close_matches(target, choices, n=1, cutoff=0.6)\n", " return matches[0] if matches else None\n", "\n", " # 创建规范字典,键为注释,值为对应的规则\n", " regular_dict = dict(zip(regular_df['注释'], regular_df.to_dict('records')))\n", "\n", " # 创建新的注释列\n", " matched_annotations = []\n", " for _, row in test_df.iterrows():\n", " # 组合三个字段\n", " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", " \n", " # 在规范中查找最佳匹配\n", " best_match = get_best_match(combined_field, regular_dict.keys())\n", " matched_annotations.append(best_match if best_match else \"未匹配\")\n", "\n", " # 获取ValueItemKind列的位置\n", " kind_idx = test_df.columns.get_loc('ValueItemKind')\n", "\n", " # 在ValueItemKind列前插入新的注释列\n", " test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n", "\n", " # 保存结果\n", " test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations.csv', \n", " index=False, encoding=test_encoding)\n", "\n", " # 打印匹配结果统计\n", " print(\"\\n匹配结果统计:\")\n", " print(pd.Series(matched_annotations).value_counts())\n", "else:\n", " print(\"无法读取文件,请手动检查文件编码\")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "测试文件编码: GB2312\n", "规范文件编码: utf-8\n", "文件成功读取!\n", "加载Sentence Transformer模型...\n", "计算规范注释的嵌入向量...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Batches: 100%|██████████| 36/36 [00:01<00:00, 24.81it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "开始匹配注释...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Batches: 100%|██████████| 4/4 [00:00<00:00, 113.98it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "匹配结果统计:\n", "未匹配 14\n", "手术类型 10\n", "皮肤黏膜-皮下出血 9\n", "辅助检查-乙肝病毒-医院名称 9\n", "患者基本情况 5\n", "主任医师签名日期 5\n", "手术医师签名 5\n", "下一步治疗方案-具体方案 5\n", "发起科室参与人员-主任医师 5\n", "一般情况-主要症状及体征-姓名 5\n", "一般情况-发育 3\n", "中断放疗-是否 3\n", "个人史-疫源接触史-接触时间 3\n", "甲状腺-左侧甲状腺包块-有无 3\n", "第一助手 2\n", "第二助手 2\n", "出院时情况 2\n", "以下血管结构可见肿瘤包饶-颈总动脉 2\n", "现病史-精神状态 2\n", "入院时情况-主诉 2\n", "病理报告-检查日期 2\n", "入院时情况-患者年龄 2\n", "讨论经过-病理科-医师姓名 2\n", "系统回顾-运动骨骼系统-关节肿痛-时长 2\n", "辅助检查-心电图-检查结论 2\n", "颈部-颈部气管切开-硅胶气管筒 2\n", "既往史-过敏史-药物食物过敏源 1\n", "一般情况-神志 1\n", "现病史-大便 1\n", "系统回顾-泌尿系统-排尿困难 1\n", "系统回顾-神经精神系统-癫痫 1\n", "脊柱四肢-关节活动 1\n", "咽-喉咽-喉咽后壁新生物-形态 1\n", "讨论经过-病理科-病理结果 1\n", "环后区-其他描述 1\n", "系统回顾-泌尿系统-排尿困难-服用药物 1\n", "系统回顾-血液系统-鼻衄史-目前清理 1\n", "记录医师签名日期 1\n", "一般情况-主要症状及体征-主诉 1\n", "喉部增强CT-CT号 1\n", "术前常规化验-化验日期 1\n", "主治医师签名 1\n", "主任医师签名 1\n", "月经史-月经周期 1\n", "术前常规化验-化验单位 1\n", "Name: count, dtype: int64\n", 
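# ——编者补充的示意代码(非原笔记本内容)——
# 前面几个单元都在重复同一段“chardet 检测编码 + 失败后按候选编码回退”的读取逻辑。
# 下面是一个最小化的假设性辅助函数 read_csv_with_detected_encoding,仅演示如何把
# 这段公共逻辑抽出来复用;假设文件可整体读入内存做检测,失败以解码类异常为准。
import chardet
import pandas as pd

def read_csv_with_detected_encoding(path, fallbacks=('gbk', 'gb18030', 'latin1', 'cp936', 'big5')):
    """先用 chardet 猜测编码,失败时依次尝试候选编码;全部失败则抛出最后一个异常。"""
    with open(path, 'rb') as f:
        guessed = chardet.detect(f.read())['encoding']
    last_err = None
    for enc in ([guessed] if guessed else []) + list(fallbacks):
        try:
            return pd.read_csv(path, encoding=enc), enc
        except (UnicodeDecodeError, LookupError) as e:
            last_err = e
    raise last_err

# 用法示意:test_df, test_encoding = read_csv_with_detected_encoding(test_file)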
"\n", "前5行匹配结果示例:\n", "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", "匹配注释: 讨论经过-病理科-医师姓名\n", "相似度分数: 0.8398\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", "匹配注释: 辅助检查-乙肝病毒-医院名称\n", "相似度分数: 0.8360\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", "匹配注释: 入院时情况-患者年龄\n", "相似度分数: 0.8390\n", "--------------------------------------------------\n", "原始字段: 病例特点-主诉-主诉\n", "匹配注释: 入院时情况-主诉\n", "相似度分数: 0.9434\n", "--------------------------------------------------\n", "原始字段: 病例特点-入院日期-入院日期\n", "匹配注释: 病理报告-检查日期\n", "相似度分数: 0.9574\n", "--------------------------------------------------\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sentence_transformers import SentenceTransformer\n", "import chardet\n", "\n", "# 首先检测文件的实际编码\n", "def detect_encoding(file_path):\n", " with open(file_path, 'rb') as f:\n", " result = chardet.detect(f.read())\n", " return result['encoding']\n", "\n", "# 检测文件编码\n", "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n", "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n", "\n", "test_encoding = detect_encoding(test_file)\n", "regular_encoding = detect_encoding(regular_file)\n", "\n", "print(f\"测试文件编码: {test_encoding}\")\n", "print(f\"规范文件编码: {regular_encoding}\")\n", "\n", "# 尝试使用检测到的编码读取文件\n", "try:\n", " # 读取规范文件\n", " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", " \n", " # 读取测试数据\n", " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", " \n", " print(\"文件成功读取!\")\n", "except Exception as e:\n", " print(f\"使用检测到的编码读取失败: {e}\")\n", " \n", " # 尝试其他常见编码\n", " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", " \n", " for enc in encodings:\n", " try:\n", " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", " test_df = pd.read_csv(test_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取测试文件\")\n", " \n", " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", " regular_df = pd.read_csv(regular_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取规范文件\")\n", " \n", " test_encoding = enc\n", " regular_encoding = enc\n", " break\n", " except Exception as e:\n", " print(f\"使用 {enc} 读取失败: {e}\")\n", "\n", "# 如果成功读取文件,继续处理\n", "if 'test_df' in locals() and 'regular_df' in locals():\n", " # 加载预训练的中文Sentence Transformer模型\n", " print(\"加载Sentence Transformer模型...\")\n", " model = SentenceTransformer('/home/limeng/SICT/lung_test/all-MiniLM-L6-v2') # 多语言模型,支持中文\n", " \n", " # 创建规范字典,键为注释,值为对应的规则\n", " regular_annotations = regular_df['注释'].tolist()\n", " \n", " # 计算规范注释的嵌入向量\n", " print(\"计算规范注释的嵌入向量...\")\n", " regular_embeddings = model.encode(regular_annotations, show_progress_bar=True)\n", " \n", " # 创建新的注释列\n", " matched_annotations = []\n", " matched_scores = []\n", " \n", " print(\"开始匹配注释...\")\n", " # 批量处理测试数据中的字段组合\n", " combined_fields = []\n", " for _, row in test_df.iterrows():\n", " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", " combined_fields.append(combined_field)\n", " \n", " # 计算测试数据的嵌入向量\n", " test_embeddings = model.encode(combined_fields, show_progress_bar=True)\n", " \n", " # 计算相似度并找到最佳匹配\n", " for i, test_embedding in enumerate(test_embeddings):\n", " # 计算与所有规范注释的余弦相似度\n", " similarities = np.dot(regular_embeddings, test_embedding) / (\n", " np.linalg.norm(regular_embeddings, axis=1) * np.linalg.norm(test_embedding)\n", " )\n", " \n", " # 找到最佳匹配\n", " best_match_idx = np.argmax(similarities)\n", " best_match_score = similarities[best_match_idx]\n", " \n", " # 如果相似度低于阈值,标记为未匹配\n", " if 
best_match_score < 0.5: # 可以调整这个阈值\n", " matched_annotations.append(\"未匹配\")\n", " matched_scores.append(0.0)\n", " else:\n", " matched_annotations.append(regular_annotations[best_match_idx])\n", " matched_scores.append(best_match_score)\n", " \n", " # 获取ValueItemKind列的位置\n", " kind_idx = test_df.columns.get_loc('ValueItemKind')\n", " \n", " # 在ValueItemKind列前插入新的注释列和相似度分数列\n", " test_df.insert(kind_idx, 'Matched_Score', matched_scores)\n", " test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n", " \n", " # 保存结果\n", " test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations_transformer.csv', \n", " index=False, encoding=test_encoding)\n", " \n", " # 打印匹配结果统计\n", " print(\"\\n匹配结果统计:\")\n", " print(pd.Series(matched_annotations).value_counts())\n", " \n", " # 打印前5行匹配结果示例\n", " print(\"\\n前5行匹配结果示例:\")\n", " for i in range(min(5, len(test_df))):\n", " print(f\"原始字段: {combined_fields[i]}\")\n", " print(f\"匹配注释: {test_df.iloc[i]['Matched_Annotation']}\")\n", " print(f\"相似度分数: {test_df.iloc[i]['Matched_Score']:.4f}\")\n", " print(\"-\" * 50)\n", "else:\n", " print(\"无法读取文件,请手动检查文件编码\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Building prefix dict from the default dictionary ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "测试文件编码: GB2312\n", "规范文件编码: utf-8\n", "文件成功读取!\n", "开始匹配注释...\n", "处理第 0/127 条记录...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Dumping model to file cache /tmp/jieba.cache\n", "Loading model cost 0.792 seconds.\n", "Prefix dict has been built successfully.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "处理第 100/127 条记录...\n", "\n", "匹配结果统计:\n", "体格检查 20\n", "未匹配 20\n", "手术经过 13\n", "姓名 11\n", "主刀医师 10\n", "现病史 7\n", "性别 5\n", "手术名称 5\n", "小结时间 5\n", "麻醉方式 5\n", "Name: count, dtype: int64\n", "\n", "匹配方法使用统计:\n", "Partial 100\n", "None 20\n", "Levenshtein 5\n", "TF-IDF 2\n", "Name: count, dtype: int64\n", "\n", "前5行匹配结果示例:\n", "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", "匹配注释: 姓名\n", "相似度分数: 1.0000\n", "匹配方法: Partial\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", "匹配注释: 姓名\n", "相似度分数: 1.0000\n", "匹配方法: Partial\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", "匹配注释: 姓名\n", "相似度分数: 1.0000\n", "匹配方法: Partial\n", "--------------------------------------------------\n", "原始字段: 病例特点-主诉-主诉\n", "匹配注释: 主诉\n", "相似度分数: 1.0000\n", "匹配方法: Partial\n", "--------------------------------------------------\n", "原始字段: 病例特点-入院日期-入院日期\n", "匹配注释: 病理报告-检查日期\n", "相似度分数: 0.6774\n", "匹配方法: Levenshtein\n", "--------------------------------------------------\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import chardet\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import jieba\n", "import Levenshtein\n", "from fuzzywuzzy import fuzz\n", "\n", "# 首先检测文件的实际编码\n", "def detect_encoding(file_path):\n", " with open(file_path, 'rb') as f:\n", " result = chardet.detect(f.read())\n", " return result['encoding']\n", "\n", "# 检测文件编码\n", "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n", "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n", "\n", "test_encoding = detect_encoding(test_file)\n", "regular_encoding = detect_encoding(regular_file)\n", "\n", "print(f\"测试文件编码: {test_encoding}\")\n", "print(f\"规范文件编码: {regular_encoding}\")\n", "\n", "# 
尝试使用检测到的编码读取文件\n", "try:\n", " # 读取规范文件\n", " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", " \n", " # 读取测试数据\n", " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", " \n", " print(\"文件成功读取!\")\n", "except Exception as e:\n", " print(f\"使用检测到的编码读取失败: {e}\")\n", " \n", " # 尝试其他常见编码\n", " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", " \n", " for enc in encodings:\n", " try:\n", " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", " test_df = pd.read_csv(test_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取测试文件\")\n", " \n", " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", " regular_df = pd.read_csv(regular_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取规范文件\")\n", " \n", " test_encoding = enc\n", " regular_encoding = enc\n", " break\n", " except Exception as e:\n", " print(f\"使用 {enc} 读取失败: {e}\")\n", "\n", "# 如果成功读取文件,继续处理\n", "if 'test_df' in locals() and 'regular_df' in locals():\n", " # 创建规范字典,键为注释,值为对应的规则\n", " regular_annotations = regular_df['注释'].tolist()\n", " \n", " # 准备测试数据中的字段组合\n", " combined_fields = []\n", " for _, row in test_df.iterrows():\n", " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", " combined_fields.append(combined_field)\n", " \n", " # 定义多种相似度计算方法\n", " def calculate_similarities(query, candidates):\n", " \"\"\"计算多种相似度指标\"\"\"\n", " results = []\n", " \n", " # 1. TF-IDF + 余弦相似度\n", " try:\n", " # 对中文文本进行分词\n", " segmented_query = ' '.join(jieba.cut(query))\n", " segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n", " \n", " # 计算TF-IDF向量\n", " vectorizer = TfidfVectorizer()\n", " tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n", " \n", " # 计算余弦相似度\n", " cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n", " \n", " # 找到最佳匹配\n", " best_idx_tfidf = np.argmax(cosine_sim)\n", " best_score_tfidf = cosine_sim[best_idx_tfidf]\n", " results.append((candidates[best_idx_tfidf], best_score_tfidf, \"TF-IDF\"))\n", " except Exception as e:\n", " print(f\"TF-IDF计算失败: {e}\")\n", " \n", " # 2. Levenshtein距离(编辑距离)\n", " try:\n", " lev_distances = [Levenshtein.distance(query, c) for c in candidates]\n", " # 将距离转换为相似度分数(越小越相似)\n", " max_len = max(len(query), max(len(c) for c in candidates))\n", " lev_similarities = [1 - dist/max_len for dist in lev_distances]\n", " \n", " best_idx_lev = np.argmax(lev_similarities)\n", " best_score_lev = lev_similarities[best_idx_lev]\n", " results.append((candidates[best_idx_lev], best_score_lev, \"Levenshtein\"))\n", " except Exception as e:\n", " print(f\"Levenshtein计算失败: {e}\")\n", " \n", " # 3. FuzzyWuzzy比率\n", " try:\n", " fuzzy_ratios = [fuzz.ratio(query, c)/100 for c in candidates]\n", " best_idx_fuzzy = np.argmax(fuzzy_ratios)\n", " best_score_fuzzy = fuzzy_ratios[best_idx_fuzzy]\n", " results.append((candidates[best_idx_fuzzy], best_score_fuzzy, \"FuzzyWuzzy\"))\n", " except Exception as e:\n", " print(f\"FuzzyWuzzy计算失败: {e}\")\n", " \n", " # 4. FuzzyWuzzy部分比率(处理子字符串)\n", " try:\n", " partial_ratios = [fuzz.partial_ratio(query, c)/100 for c in candidates]\n", " best_idx_partial = np.argmax(partial_ratios)\n", " best_score_partial = partial_ratios[best_idx_partial]\n", " results.append((candidates[best_idx_partial], best_score_partial, \"Partial\"))\n", " except Exception as e:\n", " print(f\"Partial比率计算失败: {e}\")\n", " \n", " # 5. 
FuzzyWuzzy令牌排序比率(处理词序不同)\n", " try:\n", " token_sort_ratios = [fuzz.token_sort_ratio(query, c)/100 for c in candidates]\n", " best_idx_token = np.argmax(token_sort_ratios)\n", " best_score_token = token_sort_ratios[best_idx_token]\n", " results.append((candidates[best_idx_token], best_score_token, \"TokenSort\"))\n", " except Exception as e:\n", " print(f\"TokenSort比率计算失败: {e}\")\n", " \n", " # 找出所有方法中得分最高的结果\n", " best_result = max(results, key=lambda x: x[1]) if results else (None, 0, None)\n", " \n", " return best_result\n", " \n", " # 对每个测试字段进行匹配\n", " matched_annotations = []\n", " matched_scores = []\n", " matched_methods = []\n", " \n", " print(\"开始匹配注释...\")\n", " for i, query in enumerate(combined_fields):\n", " if i % 100 == 0:\n", " print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n", " \n", " # 计算多种相似度并选择最佳匹配\n", " best_match, best_score, best_method = calculate_similarities(query, regular_annotations)\n", " \n", " # 如果相似度低于阈值,标记为未匹配\n", " if best_score < 0.6: # 可以调整这个阈值\n", " matched_annotations.append(\"未匹配\")\n", " matched_scores.append(0.0)\n", " matched_methods.append(\"None\")\n", " else:\n", " matched_annotations.append(best_match)\n", " matched_scores.append(best_score)\n", " matched_methods.append(best_method)\n", " \n", " # 获取ValueItemKind列的位置\n", " kind_idx = test_df.columns.get_loc('ValueItemKind')\n", " \n", " # 在ValueItemKind列前插入新的列\n", " test_df.insert(kind_idx, 'Matched_Method', matched_methods)\n", " test_df.insert(kind_idx, 'Matched_Score', matched_scores)\n", " test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n", " \n", " # 保存结果\n", " test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations_multi.csv', \n", " index=False, encoding=test_encoding)\n", " \n", " # 打印匹配结果统计\n", " print(\"\\n匹配结果统计:\")\n", " print(pd.Series(matched_annotations).value_counts().head(10))\n", " \n", " # 打印方法使用统计\n", " print(\"\\n匹配方法使用统计:\")\n", " print(pd.Series(matched_methods).value_counts())\n", " \n", " # 打印前5行匹配结果示例\n", " print(\"\\n前5行匹配结果示例:\")\n", " for i in range(min(5, len(test_df))):\n", " print(f\"原始字段: {combined_fields[i]}\")\n", " print(f\"匹配注释: {test_df.iloc[i]['Matched_Annotation']}\")\n", " print(f\"相似度分数: {test_df.iloc[i]['Matched_Score']:.4f}\")\n", " print(f\"匹配方法: {test_df.iloc[i]['Matched_Method']}\")\n", " print(\"-\" * 50)\n", "else:\n", " print(\"无法读取文件,请手动检查文件编码\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "测试文件编码: GB2312\n", "规范文件编码: utf-8\n", "文件成功读取!\n", "开始匹配注释...\n", "处理第 0/127 条记录...\n", "处理第 100/127 条记录...\n", "\n", "各方法匹配结果统计:\n", "TF-IDF匹配结果:\n", "主刀医师 15\n", "卡号 14\n", "手术经过 13\n", "既往史-手术外伤史-手术史-手术时间 10\n", "患者姓名 7\n", "患者基本情况 6\n", "手术名称 5\n", "麻醉方式 5\n", "参与人员 4\n", "主诉 3\n", "Name: count, dtype: int64\n", "\n", "Levenshtein匹配结果:\n", "既往史-手术外伤史-手术史-有无 13\n", "体格检查-血压-收缩压 10\n", "既往史-手术外伤史-手术史-手术时间 10\n", "入院时情况-患者姓名 8\n", "主要辅助检查-实验室检查-Na 8\n", "主治医师签名 6\n", "既往史-手术外伤史-手术史-手术名称 5\n", "患者性别 5\n", "主治医师签名时间 5\n", "个人史-饮酒史-主要饮酒种类 5\n", "Name: count, dtype: int64\n", "\n", "FuzzyWuzzy匹配结果:\n", "讨论经过-耳鼻喉科/眼科-具体手术方案 13\n", "患者姓名 11\n", "既往史-手术外伤史-手术史-手术时间 10\n", "体格检查-血压-收缩压 9\n", "主要辅助检查-实验室检查-Na 8\n", "主治医师签名 6\n", "发起科室参与人员-主任医师 5\n", "麻醉方式 5\n", "患者性别 5\n", "既往史-手术外伤史-手术史-手术名称 5\n", "Name: count, dtype: int64\n", "\n", "前5行匹配结果示例:\n", "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", "TF-IDF匹配: 患者姓名\n", "Levenshtein匹配: 患者姓名\n", "FuzzyWuzzy匹配: 患者姓名\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", 
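# ——编者补充的示意代码(非原笔记本内容)——
# 上一单元的方法统计里 Partial 被选中 100 次且分数常为 1.0,但匹配明显不对
# (例如“病例特点-患者姓名、性别、年龄-性别”匹配到“姓名”)。原因在于
# fuzz.partial_ratio 取的是短串在长串中的最佳对齐窗口:只要候选注释近似是
# 查询串的子串,分数就是 100,于是各方法按分数取最大值时会系统性偏向短候选。
# 最小复现如下(需要 fuzzywuzzy):
from fuzzywuzzy import fuzz

query = "病例特点-患者姓名、性别、年龄-性别"
print(fuzz.partial_ratio(query, "姓名"))  # 100:“姓名”恰是查询串的子串
print(fuzz.ratio(query, "姓名"))          # 分数低得多:整体相似度才反映出差异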
"TF-IDF匹配: 患者性别\n", "Levenshtein匹配: 患者姓名\n", "FuzzyWuzzy匹配: 患者姓名\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", "TF-IDF匹配: 患者年龄\n", "Levenshtein匹配: 患者姓名\n", "FuzzyWuzzy匹配: 患者姓名\n", "--------------------------------------------------\n", "原始字段: 病例特点-主诉-主诉\n", "TF-IDF匹配: 主诉\n", "Levenshtein匹配: 入院时情况-主诉\n", "FuzzyWuzzy匹配: 一般情况-主要症状及体征-主诉\n", "--------------------------------------------------\n", "原始字段: 病例特点-入院日期-入院日期\n", "TF-IDF匹配: 入院时情况-入院时间\n", "Levenshtein匹配: 病理报告-检查日期\n", "FuzzyWuzzy匹配: 现病史-外院手术日期\n", "--------------------------------------------------\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import chardet\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import jieba\n", "import Levenshtein\n", "from fuzzywuzzy import fuzz\n", "\n", "# 首先检测文件的实际编码\n", "def detect_encoding(file_path):\n", " with open(file_path, 'rb') as f:\n", " result = chardet.detect(f.read())\n", " return result['encoding']\n", "\n", "# 检测文件编码\n", "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n", "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n", "\n", "test_encoding = detect_encoding(test_file)\n", "regular_encoding = detect_encoding(regular_file)\n", "\n", "print(f\"测试文件编码: {test_encoding}\")\n", "print(f\"规范文件编码: {regular_encoding}\")\n", "\n", "# 尝试使用检测到的编码读取文件\n", "try:\n", " # 读取规范文件\n", " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", " \n", " # 读取测试数据\n", " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", " \n", " print(\"文件成功读取!\")\n", "except Exception as e:\n", " print(f\"使用检测到的编码读取失败: {e}\")\n", " \n", " # 尝试其他常见编码\n", " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", " \n", " for enc in encodings:\n", " try:\n", " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", " test_df = pd.read_csv(test_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取测试文件\")\n", " \n", " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", " regular_df = pd.read_csv(regular_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取规范文件\")\n", " \n", " test_encoding = enc\n", " regular_encoding = enc\n", " break\n", " except Exception as e:\n", " print(f\"使用 {enc} 读取失败: {e}\")\n", "\n", "# 如果成功读取文件,继续处理\n", "if 'test_df' in locals() and 'regular_df' in locals():\n", " # 创建规范字典,键为注释,值为对应的规则\n", " regular_annotations = regular_df['注释'].tolist()\n", " \n", " # 准备测试数据中的字段组合\n", " combined_fields = []\n", " for _, row in test_df.iterrows():\n", " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", " combined_fields.append(combined_field)\n", " \n", " # 定义多种相似度计算方法\n", " def calculate_similarities(query, candidates):\n", " \"\"\"计算多种相似度指标,返回每种方法的最佳匹配\"\"\"\n", " results = {}\n", " \n", " # 1. TF-IDF + 余弦相似度\n", " try:\n", " # 对中文文本进行分词\n", " segmented_query = ' '.join(jieba.cut(query))\n", " segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n", " \n", " # 计算TF-IDF向量\n", " vectorizer = TfidfVectorizer()\n", " tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n", " \n", " # 计算余弦相似度\n", " cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n", " \n", " # 找到最佳匹配\n", " best_idx_tfidf = np.argmax(cosine_sim)\n", " results['TF-IDF'] = candidates[best_idx_tfidf]\n", " except Exception as e:\n", " print(f\"TF-IDF计算失败: {e}\")\n", " results['TF-IDF'] = \"未匹配\"\n", " \n", " # 2. 
Levenshtein距离(编辑距离)\n", " try:\n", " lev_distances = [Levenshtein.distance(query, c) for c in candidates]\n", " # 将距离转换为相似度分数(越小越相似)\n", " best_idx_lev = np.argmin(lev_distances)\n", " results['Levenshtein'] = candidates[best_idx_lev]\n", " except Exception as e:\n", " print(f\"Levenshtein计算失败: {e}\")\n", " results['Levenshtein'] = \"未匹配\"\n", " \n", " # 3. FuzzyWuzzy比率\n", " try:\n", " fuzzy_ratios = [fuzz.ratio(query, c) for c in candidates]\n", " best_idx_fuzzy = np.argmax(fuzzy_ratios)\n", " results['FuzzyWuzzy'] = candidates[best_idx_fuzzy]\n", " except Exception as e:\n", " print(f\"FuzzyWuzzy计算失败: {e}\")\n", " results['FuzzyWuzzy'] = \"未匹配\"\n", " \n", " # 4. FuzzyWuzzy部分比率(处理子字符串)\n", " try:\n", " partial_ratios = [fuzz.partial_ratio(query, c) for c in candidates]\n", " best_idx_partial = np.argmax(partial_ratios)\n", " results['Partial'] = candidates[best_idx_partial]\n", " except Exception as e:\n", " print(f\"Partial比率计算失败: {e}\")\n", " results['Partial'] = \"未匹配\"\n", " \n", " # 5. FuzzyWuzzy令牌排序比率(处理词序不同)\n", " try:\n", " token_sort_ratios = [fuzz.token_sort_ratio(query, c) for c in candidates]\n", " best_idx_token = np.argmax(token_sort_ratios)\n", " results['TokenSort'] = candidates[best_idx_token]\n", " except Exception as e:\n", " print(f\"TokenSort比率计算失败: {e}\")\n", " results['TokenSort'] = \"未匹配\"\n", " \n", " return results\n", " \n", " # 对每个测试字段进行匹配\n", " tfidf_matches = []\n", " levenshtein_matches = []\n", " fuzzywuzzy_matches = []\n", " \n", " print(\"开始匹配注释...\")\n", " for i, query in enumerate(combined_fields):\n", " if i % 100 == 0:\n", " print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n", " \n", " # 计算多种相似度\n", " matches = calculate_similarities(query, regular_annotations)\n", " \n", " # 保存各种方法的匹配结果\n", " tfidf_matches.append(matches.get('TF-IDF', \"未匹配\"))\n", " levenshtein_matches.append(matches.get('Levenshtein', \"未匹配\"))\n", " fuzzywuzzy_matches.append(matches.get('FuzzyWuzzy', \"未匹配\"))\n", " \n", " # 获取ValueItemKind列的位置\n", " kind_idx = test_df.columns.get_loc('ValueItemKind')\n", " \n", " # 在ValueItemKind列前插入新的列\n", " test_df.insert(kind_idx, 'FuzzyWuzzy_Match', fuzzywuzzy_matches)\n", " test_df.insert(kind_idx, 'Levenshtein_Match', levenshtein_matches)\n", " test_df.insert(kind_idx, 'TFIDF_Match', tfidf_matches)\n", " \n", " # 保存结果\n", " test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations_all.csv', \n", " index=False, encoding=test_encoding)\n", " \n", " # 打印匹配结果统计\n", " print(\"\\n各方法匹配结果统计:\")\n", " print(\"TF-IDF匹配结果:\")\n", " print(pd.Series(tfidf_matches).value_counts().head(10))\n", " print(\"\\nLevenshtein匹配结果:\")\n", " print(pd.Series(levenshtein_matches).value_counts().head(10))\n", " print(\"\\nFuzzyWuzzy匹配结果:\")\n", " print(pd.Series(fuzzywuzzy_matches).value_counts().head(10))\n", " \n", " # 打印前5行匹配结果示例\n", " print(\"\\n前5行匹配结果示例:\")\n", " for i in range(min(5, len(test_df))):\n", " print(f\"原始字段: {combined_fields[i]}\")\n", " print(f\"TF-IDF匹配: {test_df.iloc[i]['TFIDF_Match']}\")\n", " print(f\"Levenshtein匹配: {test_df.iloc[i]['Levenshtein_Match']}\")\n", " print(f\"FuzzyWuzzy匹配: {test_df.iloc[i]['FuzzyWuzzy_Match']}\")\n", " print(\"-\" * 50)\n", "else:\n", " print(\"无法读取文件,请手动检查文件编码\")" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "ename": "FileNotFoundError", "evalue": "[Errno 2] No such file or directory: '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'", "output_type": "error", "traceback": [ 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[1], line 20\u001b[0m\n\u001b[1;32m 17\u001b[0m test_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 18\u001b[0m regular_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/home/limeng/SICT/lung_test/data/regular.csv\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 20\u001b[0m test_encoding \u001b[38;5;241m=\u001b[39m \u001b[43mdetect_encoding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtest_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 21\u001b[0m regular_encoding \u001b[38;5;241m=\u001b[39m detect_encoding(regular_file)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m测试文件编码: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtest_encoding\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", "Cell \u001b[0;32mIn[1], line 12\u001b[0m, in \u001b[0;36mdetect_encoding\u001b[0;34m(file_path)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdetect_encoding\u001b[39m(file_path):\n\u001b[0;32m---> 12\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 13\u001b[0m result \u001b[38;5;241m=\u001b[39m chardet\u001b[38;5;241m.\u001b[39mdetect(f\u001b[38;5;241m.\u001b[39mread())\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mencoding\u001b[39m\u001b[38;5;124m'\u001b[39m]\n", "File \u001b[0;32m~/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/IPython/core/interactiveshell.py:324\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 319\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 320\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. 
If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 321\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 322\u001b[0m )\n\u001b[0;32m--> 324\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import chardet\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import jieba\n", "import Levenshtein\n", "from fuzzywuzzy import fuzz\n", "\n", "# 首先检测文件的实际编码\n", "def detect_encoding(file_path):\n", " with open(file_path, 'rb') as f:\n", " result = chardet.detect(f.read())\n", " return result['encoding']\n", "\n", "# 检测文件编码\n", "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n", "\n", "test_encoding = detect_encoding(test_file)\n", "regular_encoding = detect_encoding(regular_file)\n", "\n", "print(f\"测试文件编码: {test_encoding}\")\n", "print(f\"规范文件编码: {regular_encoding}\")\n", "\n", "# 尝试使用检测到的编码读取文件\n", "try:\n", " # 读取规范文件\n", " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", " \n", " # 读取测试数据\n", " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", " \n", " print(\"文件成功读取!\")\n", "except Exception as e:\n", " print(f\"使用检测到的编码读取失败: {e}\")\n", " \n", " # 尝试其他常见编码\n", " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", " \n", " for enc in encodings:\n", " try:\n", " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", " test_df = pd.read_csv(test_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取测试文件\")\n", " \n", " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", " regular_df = pd.read_csv(regular_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取规范文件\")\n", " \n", " test_encoding = enc\n", " regular_encoding = enc\n", " break\n", " except Exception as e:\n", " print(f\"使用 {enc} 读取失败: {e}\")\n", "\n", "# 如果成功读取文件,继续处理\n", "if 'test_df' in locals() and 'regular_df' in locals():\n", " # 创建规范字典,键为注释,值为对应的规则\n", " regular_annotations = regular_df['注释'].tolist()\n", " \n", " # 准备测试数据中的字段组合\n", " combined_fields = []\n", " paragraph_names = []\n", " statement_names = []\n", " value_item_names = []\n", " \n", " for _, row in test_df.iterrows():\n", " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", " combined_fields.append(combined_field)\n", " paragraph_names.append(row['ParagraphName'])\n", " statement_names.append(row['StatementName'])\n", " value_item_names.append(row['ValueItemName'])\n", " \n", " # 定义多种相似度计算方法\n", " def calculate_similarities(query, candidates):\n", " \"\"\"计算多种相似度指标,返回每种方法的最佳匹配和分数\"\"\"\n", " results = {}\n", " scores = {}\n", " \n", " # 1. 
TF-IDF + 余弦相似度\n", " try:\n", " # 对中文文本进行分词\n", " segmented_query = ' '.join(jieba.cut(query))\n", " segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n", " \n", " # 计算TF-IDF向量\n", " vectorizer = TfidfVectorizer()\n", " tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n", " \n", " # 计算余弦相似度\n", " cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n", " \n", " # 找到最佳匹配\n", " best_idx_tfidf = np.argmax(cosine_sim)\n", " results['TF-IDF'] = candidates[best_idx_tfidf]\n", " scores['TF-IDF'] = cosine_sim[best_idx_tfidf]\n", " except Exception as e:\n", " print(f\"TF-IDF计算失败: {e}\")\n", " results['TF-IDF'] = \"未匹配\"\n", " scores['TF-IDF'] = 0.0\n", " \n", " # 2. Levenshtein距离(编辑距离)\n", " try:\n", " lev_distances = [Levenshtein.distance(query, c) for c in candidates]\n", " # 将距离转换为相似度分数(越小越相似)\n", " max_len = max(len(query), max(len(c) for c in candidates))\n", " lev_similarities = [1 - dist/max_len for dist in lev_distances]\n", " \n", " best_idx_lev = np.argmax(lev_similarities)\n", " results['Levenshtein'] = candidates[best_idx_lev]\n", " scores['Levenshtein'] = lev_similarities[best_idx_lev]\n", " except Exception as e:\n", " print(f\"Levenshtein计算失败: {e}\")\n", " results['Levenshtein'] = \"未匹配\"\n", " scores['Levenshtein'] = 0.0\n", " \n", " # 3. FuzzyWuzzy比率\n", " try:\n", " fuzzy_ratios = [fuzz.ratio(query, c)/100 for c in candidates]\n", " best_idx_fuzzy = np.argmax(fuzzy_ratios)\n", " results['FuzzyWuzzy'] = candidates[best_idx_fuzzy]\n", " scores['FuzzyWuzzy'] = fuzzy_ratios[best_idx_fuzzy]\n", " except Exception as e:\n", " print(f\"FuzzyWuzzy计算失败: {e}\")\n", " results['FuzzyWuzzy'] = \"未匹配\"\n", " scores['FuzzyWuzzy'] = 0.0\n", " \n", " return results, scores\n", " \n", " # 对每个测试字段进行匹配\n", " tfidf_matches = []\n", " levenshtein_matches = []\n", " fuzzywuzzy_matches = []\n", " best_matches = []\n", " best_match_methods = []\n", " \n", " print(\"开始匹配注释...\")\n", " for i in range(len(combined_fields)):\n", " if i % 100 == 0:\n", " print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n", " \n", " query = combined_fields[i]\n", " paragraph_name = paragraph_names[i]\n", " value_item_name = value_item_names[i]\n", " \n", " # 1. 首先检查是否有注释包含ParagraphName\n", " paragraph_matches = [ann for ann in regular_annotations if paragraph_name in ann]\n", " \n", " if paragraph_matches:\n", " # 2. 如果有包含ParagraphName的注释,再检查是否有同时包含ValueItemName的\n", " value_matches = [ann for ann in paragraph_matches if value_item_name in ann]\n", " if value_matches:\n", " # 找到同时包含ParagraphName和ValueItemName的注释\n", " best_match = value_matches[0] # 取第一个匹配\n", " best_match_method = \"精确匹配(段落+值)\"\n", " else:\n", " # 只找到包含ParagraphName的注释\n", " best_match = paragraph_matches[0] # 取第一个匹配\n", " best_match_method = \"段落匹配\"\n", " else:\n", " # 3. 
如果没有包含ParagraphName的注释,直接使用相似度指标\n", " matches, scores = calculate_similarities(query, regular_annotations)\n", " \n", " # 选择得分最高的方法\n", " best_method = max(scores.items(), key=lambda x: x[1])[0]\n", " best_match = matches[best_method]\n", " best_match_method = f\"相似度({best_method})\"\n", " \n", " # 计算相似度匹配以便比较\n", " matches, _ = calculate_similarities(query, regular_annotations)\n", " tfidf_matches.append(matches.get('TF-IDF', \"未匹配\"))\n", " levenshtein_matches.append(matches.get('Levenshtein', \"未匹配\"))\n", " fuzzywuzzy_matches.append(matches.get('FuzzyWuzzy', \"未匹配\"))\n", " \n", " best_matches.append(best_match)\n", " best_match_methods.append(best_match_method)\n", " \n", " # 获取ValueItemKind列的位置\n", " kind_idx = test_df.columns.get_loc('ValueItemKind')\n", " \n", " # 在ValueItemKind列前插入新的列\n", " test_df.insert(kind_idx, 'Best_Match_Method', best_match_methods)\n", " test_df.insert(kind_idx, 'Best_Match', best_matches)\n", " test_df.insert(kind_idx, 'FuzzyWuzzy_Match', fuzzywuzzy_matches)\n", " test_df.insert(kind_idx, 'Levenshtein_Match', levenshtein_matches)\n", " test_df.insert(kind_idx, 'TFIDF_Match', tfidf_matches)\n", " \n", " # 保存结果\n", " test_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_with_annotations_all3.csv', \n", " index=False, encoding=test_encoding)\n", " \n", " # 打印匹配结果统计\n", " print(\"\\n最佳匹配方法统计:\")\n", " print(pd.Series(best_match_methods).value_counts())\n", " \n", " print(\"\\n各方法匹配结果统计:\")\n", " print(\"最佳匹配结果:\")\n", " print(pd.Series(best_matches).value_counts().head(10))\n", " print(\"\\nTF-IDF匹配结果:\")\n", " print(pd.Series(tfidf_matches).value_counts().head(10))\n", " print(\"\\nLevenshtein匹配结果:\")\n", " print(pd.Series(levenshtein_matches).value_counts().head(10))\n", " print(\"\\nFuzzyWuzzy匹配结果:\")\n", " print(pd.Series(fuzzywuzzy_matches).value_counts().head(10))\n", " \n", " # 打印前5行匹配结果示例\n", " print(\"\\n前5行匹配结果示例:\")\n", " for i in range(min(5, len(test_df))):\n", " print(f\"原始字段: {combined_fields[i]}\")\n", " print(f\"最佳匹配: {test_df.iloc[i]['Best_Match']} (方法: {test_df.iloc[i]['Best_Match_Method']})\")\n", " print(f\"TF-IDF匹配: {test_df.iloc[i]['TFIDF_Match']}\")\n", " print(f\"Levenshtein匹配: {test_df.iloc[i]['Levenshtein_Match']}\")\n", " print(f\"FuzzyWuzzy匹配: {test_df.iloc[i]['FuzzyWuzzy_Match']}\")\n", " print(\"-\" * 50)\n", "else:\n", " print(\"无法读取文件,请手动检查文件编码\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Building prefix dict from the default dictionary ...\n", "Loading model from cache /tmp/jieba.cache\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "测试文件编码: GB2312\n", "规范文件编码: utf-8\n", "文件成功读取!\n", "开始匹配注释...\n", "处理第 0/127 条记录...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Loading model cost 1.175 seconds.\n", "Prefix dict has been built successfully.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "处理第 100/127 条记录...\n", "\n", "最佳匹配方法统计:\n", "TF-IDF相似度 89\n", "段落匹配 32\n", "精确匹配(段落+值) 6\n", "Name: count, dtype: int64\n", "\n", "匹配结果统计:\n", "最佳匹配结果:\n", "体格检查-体温 20\n", "主刀医师 15\n", "卡号 14\n", "手术经过 13\n", "既往史-手术外伤史-手术史-手术时间 10\n", "患者姓名 7\n", "患者基本情况 6\n", "现病史-发病日期 6\n", "麻醉方式 5\n", "手术名称 5\n", "Name: count, dtype: int64\n", "\n", "TF-IDF匹配结果:\n", "主刀医师 15\n", "卡号 14\n", "手术经过 13\n", "既往史-手术外伤史-手术史-手术时间 10\n", "患者姓名 7\n", "患者基本情况 6\n", "手术名称 5\n", "麻醉方式 5\n", "参与人员 4\n", "主诉 3\n", "Name: count, dtype: int64\n", "\n", "前5行匹配结果示例:\n", "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", "最佳匹配: 患者姓名 (方法: 
TF-IDF相似度)\n", "相似度分数: 0.5819\n", "TF-IDF匹配: 患者姓名\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", "最佳匹配: 患者性别 (方法: TF-IDF相似度)\n", "相似度分数: 0.6637\n", "TF-IDF匹配: 患者性别\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", "最佳匹配: 患者年龄 (方法: TF-IDF相似度)\n", "相似度分数: 0.6576\n", "TF-IDF匹配: 患者年龄\n", "--------------------------------------------------\n", "原始字段: 病例特点-主诉-主诉\n", "最佳匹配: 主诉 (方法: TF-IDF相似度)\n", "相似度分数: 0.7628\n", "TF-IDF匹配: 主诉\n", "--------------------------------------------------\n", "原始字段: 病例特点-入院日期-入院日期\n", "最佳匹配: 入院时情况-入院时间 (方法: TF-IDF相似度)\n", "相似度分数: 0.5297\n", "TF-IDF匹配: 入院时情况-入院时间\n", "--------------------------------------------------\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import chardet\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import jieba\n", "\n", "# 首先检测文件的实际编码\n", "def detect_encoding(file_path):\n", " with open(file_path, 'rb') as f:\n", " result = chardet.detect(f.read())\n", " return result['encoding']\n", "\n", "# 检测文件编码\n", "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n", "\n", "test_encoding = detect_encoding(test_file)\n", "regular_encoding = detect_encoding(regular_file)\n", "\n", "print(f\"测试文件编码: {test_encoding}\")\n", "print(f\"规范文件编码: {regular_encoding}\")\n", "\n", "# 尝试使用检测到的编码读取文件\n", "try:\n", " # 读取规范文件\n", " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", " \n", " # 读取测试数据\n", " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", " \n", " print(\"文件成功读取!\")\n", "except Exception as e:\n", " print(f\"使用检测到的编码读取失败: {e}\")\n", " \n", " # 尝试其他常见编码\n", " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", " \n", " for enc in encodings:\n", " try:\n", " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", " test_df = pd.read_csv(test_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取测试文件\")\n", " \n", " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", " regular_df = pd.read_csv(regular_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取规范文件\")\n", " \n", " test_encoding = enc\n", " regular_encoding = enc\n", " break\n", " except Exception as e:\n", " print(f\"使用 {enc} 读取失败: {e}\")\n", "\n", "# 如果成功读取文件,继续处理\n", "if 'test_df' in locals() and 'regular_df' in locals():\n", " # 创建规范字典,键为注释,值为对应的规则\n", " regular_annotations = regular_df['注释'].tolist()\n", " \n", " # 准备测试数据中的字段组合\n", " combined_fields = []\n", " paragraph_names = []\n", " statement_names = []\n", " value_item_names = []\n", " \n", " for _, row in test_df.iterrows():\n", " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", " combined_fields.append(combined_field)\n", " paragraph_names.append(row['ParagraphName'])\n", " statement_names.append(row['StatementName'])\n", " value_item_names.append(row['ValueItemName'])\n", " \n", " # 定义TF-IDF相似度计算方法\n", " def calculate_tfidf_similarity(query, candidates):\n", " \"\"\"计算TF-IDF相似度,返回最佳匹配和分数\"\"\"\n", " try:\n", " # 对中文文本进行分词\n", " segmented_query = ' '.join(jieba.cut(query))\n", " segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n", " \n", " # 计算TF-IDF向量\n", " vectorizer = TfidfVectorizer()\n", " tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n", " \n", " # 计算余弦相似度\n", " cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n", " \n", " # 找到最佳匹配\n", " 
best_idx = np.argmax(cosine_sim)\n", " return candidates[best_idx], cosine_sim[best_idx]\n", " except Exception as e:\n", " print(f\"TF-IDF计算失败: {e}\")\n", " return \"未匹配\", 0.0\n", " \n", " # 对每个测试字段进行匹配\n", " tfidf_matches = []\n", " best_matches = []\n", " best_match_methods = []\n", " similarity_scores = []\n", " \n", " print(\"开始匹配注释...\")\n", " for i in range(len(combined_fields)):\n", " if i % 100 == 0:\n", " print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n", " \n", " query = combined_fields[i]\n", " paragraph_name = paragraph_names[i]\n", " value_item_name = value_item_names[i]\n", " \n", " # 1. 首先检查是否有注释包含ParagraphName\n", " paragraph_matches = [ann for ann in regular_annotations if paragraph_name in ann]\n", " \n", " if paragraph_matches:\n", " # 2. 如果有包含ParagraphName的注释,再检查是否有同时包含ValueItemName的\n", " value_matches = [ann for ann in paragraph_matches if value_item_name in ann]\n", " if value_matches:\n", " # 找到同时包含ParagraphName和ValueItemName的注释\n", " best_match = value_matches[0] # 取第一个匹配\n", " best_match_method = \"精确匹配(段落+值)\"\n", " similarity_score = 1.0 # 精确匹配给予最高分\n", " else:\n", " # 只找到包含ParagraphName的注释\n", " best_match = paragraph_matches[0] # 取第一个匹配\n", " best_match_method = \"段落匹配\"\n", " similarity_score = 0.8 # 段落匹配给予较高分\n", " else:\n", " # 3. 如果没有包含ParagraphName的注释,使用TF-IDF相似度\n", " best_match, similarity_score = calculate_tfidf_similarity(query, regular_annotations)\n", " best_match_method = \"TF-IDF相似度\"\n", " \n", " # 计算TF-IDF匹配以便比较\n", " tfidf_match, _ = calculate_tfidf_similarity(query, regular_annotations)\n", " tfidf_matches.append(tfidf_match)\n", " \n", " best_matches.append(best_match)\n", " best_match_methods.append(best_match_method)\n", " similarity_scores.append(similarity_score)\n", " \n", " # 获取ValueItemKind列的位置\n", " kind_idx = test_df.columns.get_loc('ValueItemKind')\n", " \n", " # 在ValueItemKind列前插入新的列\n", " test_df.insert(kind_idx, 'Similarity_Score', similarity_scores)\n", " test_df.insert(kind_idx, 'Best_Match_Method', best_match_methods)\n", " test_df.insert(kind_idx, 'Best_Match', best_matches)\n", " test_df.insert(kind_idx, 'TFIDF_Match', tfidf_matches)\n", " \n", " # 保存结果\n", " test_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_with_tfidf.csv', \n", " index=False, encoding=test_encoding)\n", " \n", " # 打印匹配结果统计\n", " print(\"\\n最佳匹配方法统计:\")\n", " print(pd.Series(best_match_methods).value_counts())\n", " \n", " print(\"\\n匹配结果统计:\")\n", " print(\"最佳匹配结果:\")\n", " print(pd.Series(best_matches).value_counts().head(10))\n", " print(\"\\nTF-IDF匹配结果:\")\n", " print(pd.Series(tfidf_matches).value_counts().head(10))\n", " \n", " # 打印前5行匹配结果示例\n", " print(\"\\n前5行匹配结果示例:\")\n", " for i in range(min(5, len(test_df))):\n", " print(f\"原始字段: {combined_fields[i]}\")\n", " print(f\"最佳匹配: {test_df.iloc[i]['Best_Match']} (方法: {test_df.iloc[i]['Best_Match_Method']})\")\n", " print(f\"相似度分数: {test_df.iloc[i]['Similarity_Score']:.4f}\")\n", " print(f\"TF-IDF匹配: {test_df.iloc[i]['TFIDF_Match']}\")\n", " print(\"-\" * 50)\n", "else:\n", " print(\"无法读取文件,请手动检查文件编码\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "测试文件编码: GB2312\n", "规范文件编码: utf-8\n", "文件成功读取!\n", "开始匹配注释...\n", "处理第 0/127 条记录...\n", "处理第 100/127 条记录...\n", "\n", "匹配完成,共处理 127 条记录\n", "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\n", "\n", "前3行匹配结果示例:\n", "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", "匹配1: 患者姓名 (分数: 0.5819)\n", "匹配2: 患者姓名 (分数: 0.5819)\n", "匹配3: 姓名 (分数: 0.5574)\n", 
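# ——编者补充的示意代码(非原笔记本内容)——
# 本单元取前 3 名用的是 np.argsort(cosine_sim)[::-1][:3],即对全部 1142 个候选
# 做完整排序(O(n log n))。规模更大时可先用 np.argpartition 做 O(n) 的部分选择,
# 再只对选出的 k 个排序;以下为等价写法的示意:
import numpy as np

def top_k_indices(scores, k=3):
    k = min(k, len(scores))
    part = np.argpartition(scores, -k)[-k:]        # 最大的 k 个下标(组内无序)
    return part[np.argsort(scores[part])[::-1]]    # 组内再按分数降序排列

print(top_k_indices(np.array([0.2, 0.9, 0.5, 0.7])))  # [1 3 2]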
"--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", "匹配1: 患者性别 (分数: 0.6637)\n", "匹配2: 性别 (分数: 0.6416)\n", "匹配3: 性别 (分数: 0.6416)\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", "匹配1: 患者年龄 (分数: 0.6576)\n", "匹配2: 年龄 (分数: 0.6348)\n", "匹配3: 年龄 (分数: 0.6348)\n", "--------------------------------------------------\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import chardet\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import jieba\n", "\n", "# 首先检测文件的实际编码\n", "def detect_encoding(file_path):\n", " with open(file_path, 'rb') as f:\n", " result = chardet.detect(f.read())\n", " return result['encoding']\n", "\n", "# 检测文件编码\n", "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n", "\n", "test_encoding = detect_encoding(test_file)\n", "regular_encoding = detect_encoding(regular_file)\n", "\n", "print(f\"测试文件编码: {test_encoding}\")\n", "print(f\"规范文件编码: {regular_encoding}\")\n", "\n", "# 尝试使用检测到的编码读取文件\n", "try:\n", " # 读取规范文件\n", " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", " \n", " # 读取测试数据\n", " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", " \n", " print(\"文件成功读取!\")\n", "except Exception as e:\n", " print(f\"使用检测到的编码读取失败: {e}\")\n", " \n", " # 尝试其他常见编码\n", " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", " \n", " for enc in encodings:\n", " try:\n", " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", " test_df = pd.read_csv(test_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取测试文件\")\n", " \n", " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", " regular_df = pd.read_csv(regular_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取规范文件\")\n", " \n", " test_encoding = enc\n", " regular_encoding = enc\n", " break\n", " except Exception as e:\n", " print(f\"使用 {enc} 读取失败: {e}\")\n", "\n", "# 如果成功读取文件,继续处理\n", "if 'test_df' in locals() and 'regular_df' in locals():\n", " # 创建规范字典,键为注释,值为对应的规则\n", " regular_annotations = regular_df['注释'].tolist()\n", " \n", " # 准备测试数据中的字段组合\n", " combined_fields = []\n", " \n", " for _, row in test_df.iterrows():\n", " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", " combined_fields.append(combined_field)\n", " \n", " # 定义TF-IDF相似度计算方法,返回前3个最佳匹配\n", " def calculate_top3_tfidf_similarity(query, candidates):\n", " \"\"\"计算TF-IDF相似度,返回前3个最佳匹配和分数\"\"\"\n", " try:\n", " # 对中文文本进行分词\n", " segmented_query = ' '.join(jieba.cut(query))\n", " segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n", " \n", " # 计算TF-IDF向量\n", " vectorizer = TfidfVectorizer()\n", " tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n", " \n", " # 计算余弦相似度\n", " cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n", " \n", " # 找到前3个最佳匹配\n", " top3_indices = np.argsort(cosine_sim)[::-1][:3]\n", " top3_scores = cosine_sim[top3_indices]\n", " \n", " return top3_indices, top3_scores\n", " except Exception as e:\n", " print(f\"TF-IDF计算失败: {e}\")\n", " return [-1, -1, -1], [0.0, 0.0, 0.0]\n", " \n", " # 创建结果DataFrame\n", " result_data = []\n", " \n", " print(\"开始匹配注释...\")\n", " for i in range(len(test_df)):\n", " if i % 100 == 0:\n", " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n", " \n", " query = combined_fields[i]\n", " \n", " # 使用TF-IDF相似度匹配,获取前3个最佳匹配\n", " top3_indices, top3_scores = 
calculate_top3_tfidf_similarity(query, regular_annotations)\n", " \n", " # 获取测试数据的相关字段\n", " paragraph_name = test_df.iloc[i]['ParagraphName']\n", " statement_name = test_df.iloc[i]['StatementName']\n", " value_item_name = test_df.iloc[i]['ValueItemName']\n", " display_string = test_df.iloc[i]['DisplayString']\n", " \n", " # 获取前3个规范数据的相关字段\n", " regular_nodes = []\n", " regular_annotations_matched = []\n", " regular_descriptions = []\n", " \n", " for idx, score in zip(top3_indices, top3_scores):\n", " if idx >= 0:\n", " regular_nodes.append(regular_df.iloc[idx]['节点名'])\n", " regular_annotations_matched.append(regular_df.iloc[idx]['注释'])\n", " regular_descriptions.append(regular_df.iloc[idx]['说明'])\n", " else:\n", " regular_nodes.append(\"未匹配\")\n", " regular_annotations_matched.append(\"未匹配\")\n", " regular_descriptions.append(\"未匹配\")\n", " \n", " # 确保有3个结果(如果候选项少于3个)\n", " while len(regular_nodes) < 3:\n", " regular_nodes.append(\"未匹配\")\n", " regular_annotations_matched.append(\"未匹配\")\n", " regular_descriptions.append(\"未匹配\")\n", " top3_scores = np.append(top3_scores, 0.0)\n", " \n", " # 添加到结果数据\n", " result_data.append({\n", " 'ParagraphName': paragraph_name,\n", " 'StatementName': statement_name,\n", " 'ValueItemName': value_item_name,\n", " 'DisplayString': display_string,\n", " '规范节点名1': regular_nodes[0],\n", " '规范注释1': regular_annotations_matched[0],\n", " '规范说明1': regular_descriptions[0],\n", " '相似度分数1': top3_scores[0],\n", " '规范节点名2': regular_nodes[1],\n", " '规范注释2': regular_annotations_matched[1],\n", " '规范说明2': regular_descriptions[1],\n", " '相似度分数2': top3_scores[1],\n", " '规范节点名3': regular_nodes[2],\n", " '规范注释3': regular_annotations_matched[2],\n", " '规范说明3': regular_descriptions[2],\n", " '相似度分数3': top3_scores[2]\n", " })\n", " \n", " # 创建结果DataFrame\n", " result_df = pd.DataFrame(result_data)\n", " \n", " # 保存结果\n", " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv', \n", " index=False, encoding=test_encoding)\n", " \n", " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n", " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\")\n", " \n", " # 打印前3行匹配结果示例\n", " print(\"\\n前3行匹配结果示例:\")\n", " for i in range(min(3, len(result_df))):\n", " print(f\"原始字段: {combined_fields[i]}\")\n", " print(f\"匹配1: {result_df.iloc[i]['规范注释1']} (分数: {result_df.iloc[i]['相似度分数1']:.4f})\")\n", " print(f\"匹配2: {result_df.iloc[i]['规范注释2']} (分数: {result_df.iloc[i]['相似度分数2']:.4f})\")\n", " print(f\"匹配3: {result_df.iloc[i]['规范注释3']} (分数: {result_df.iloc[i]['相似度分数3']:.4f})\")\n", " print(\"-\" * 50)\n", "else:\n", " print(\"无法读取文件,请手动检查文件编码\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "测试文件编码: GB2312\n", "规范文件编码: utf-8\n", "文件成功读取!\n", "加载Sentence Transformer模型...\n", "对规范注释进行编码...\n", "开始匹配注释...\n", "处理第 0/127 条记录...\n", "处理第 100/127 条记录...\n", "\n", "匹配完成,共处理 127 条记录\n", "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_transformer_top3_match.csv\n", "\n", "前3行匹配结果示例:\n", "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", "匹配1: 讨论经过-病理科-医师姓名 (分数: 0.8398)\n", "匹配2: 讨论经过-放射科-医师姓名 (分数: 0.8398)\n", "匹配3: 讨论经过-放化疗科-医师姓名 (分数: 0.8384)\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", "匹配1: 辅助检查-乙肝病毒-医院名称 (分数: 0.8360)\n", "匹配2: 辅助检查-骨扫描检查-医院名称 (分数: 0.8325)\n", "匹配3: 讨论经过-病理科-医师姓名 (分数: 0.8265)\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", "匹配1: 入院时情况-患者年龄 (分数: 0.8390)\n", "匹配2: 
辅助检查-乙肝病毒-医院名称 (分数: 0.8046)\n", "匹配3: 辅助检查-骨扫描检查-医院名称 (分数: 0.8012)\n", "--------------------------------------------------\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import chardet\n", "from sentence_transformers import SentenceTransformer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import torch\n", "\n", "# 首先检测文件的实际编码\n", "def detect_encoding(file_path):\n", " with open(file_path, 'rb') as f:\n", " result = chardet.detect(f.read())\n", " return result['encoding']\n", "\n", "# 检测文件编码\n", "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n", "\n", "test_encoding = detect_encoding(test_file)\n", "regular_encoding = detect_encoding(regular_file)\n", "\n", "print(f\"测试文件编码: {test_encoding}\")\n", "print(f\"规范文件编码: {regular_encoding}\")\n", "\n", "# 尝试使用检测到的编码读取文件\n", "try:\n", " # 读取规范文件\n", " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", " \n", " # 读取测试数据\n", " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", " \n", " print(\"文件成功读取!\")\n", "except Exception as e:\n", " print(f\"使用检测到的编码读取失败: {e}\")\n", " \n", " # 尝试其他常见编码\n", " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", " \n", " for enc in encodings:\n", " try:\n", " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", " test_df = pd.read_csv(test_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取测试文件\")\n", " \n", " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", " regular_df = pd.read_csv(regular_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取规范文件\")\n", " \n", " test_encoding = enc\n", " regular_encoding = enc\n", " break\n", " except Exception as e:\n", " print(f\"使用 {enc} 读取失败: {e}\")\n", "\n", "# 如果成功读取文件,继续处理\n", "if 'test_df' in locals() and 'regular_df' in locals():\n", " # 创建规范字典,键为注释,值为对应的规则\n", " regular_annotations = regular_df['注释'].tolist()\n", " \n", " # 准备测试数据中的字段组合\n", " combined_fields = []\n", " \n", " for _, row in test_df.iterrows():\n", " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", " combined_fields.append(combined_field)\n", " \n", " # 加载预训练的Sentence Transformer模型\n", " print(\"加载Sentence Transformer模型...\")\n", " \n", " # 从本地路径加载模型(注意:all-MiniLM-L6-v2 为英文模型,中文效果有限)\n", " model = SentenceTransformer('/home/limeng/SICT/lung_test/all-MiniLM-L6-v2')\n", "\n", " # 定义Sentence Transformer相似度计算方法,返回前3个最佳匹配\n", " def calculate_top3_transformer_similarity(query, candidates, model):\n", " \"\"\"计算Sentence Transformer相似度,返回前3个最佳匹配和分数\"\"\"\n", " try:\n", " # 编码查询和候选项\n", " query_embedding = model.encode([query], convert_to_tensor=True)\n", " candidate_embeddings = model.encode(candidates, convert_to_tensor=True)\n", " \n", " # 计算余弦相似度\n", " cosine_scores = cosine_similarity(\n", " query_embedding.cpu().numpy(), \n", " candidate_embeddings.cpu().numpy()\n", " )[0]\n", " \n", " # 找到前3个最佳匹配\n", " top3_indices = np.argsort(cosine_scores)[::-1][:3]\n", " top3_scores = cosine_scores[top3_indices]\n", " \n", " return top3_indices, top3_scores\n", " except Exception as e:\n", " print(f\"Transformer相似度计算失败: {e}\")\n", " return [-1, -1, -1], [0.0, 0.0, 0.0]\n", " \n", " # 创建结果DataFrame\n", " result_data = []\n", " \n", " # 注意:尽管此处打印“对规范注释进行编码”,上面的函数每次调用仍会重新编码全部候选注释,\n", " # 并未真正缓存;候选集较大时应在循环外先 encode 一次再复用\n", " print(\"对规范注释进行编码...\")\n", " \n", " print(\"开始匹配注释...\")\n", " for i in range(len(test_df)):\n", " if i % 100 == 0:\n", " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n", " \n", " query = combined_fields[i]\n", " \n", " # 使用Sentence Transformer相似度匹配,获取前3个最佳匹配\n", " 
top3_indices, top3_scores = calculate_top3_transformer_similarity(query, regular_annotations, model)\n", " \n", " # 获取测试数据的相关字段\n", " paragraph_name = test_df.iloc[i]['ParagraphName']\n", " statement_name = test_df.iloc[i]['StatementName']\n", " value_item_name = test_df.iloc[i]['ValueItemName']\n", " display_string = test_df.iloc[i]['DisplayString']\n", " \n", " # 获取前3个规范数据的相关字段\n", " regular_nodes = []\n", " regular_annotations_matched = []\n", " regular_descriptions = []\n", " \n", " for idx, score in zip(top3_indices, top3_scores):\n", " if idx >= 0:\n", " regular_nodes.append(regular_df.iloc[idx]['节点名'])\n", " regular_annotations_matched.append(regular_df.iloc[idx]['注释'])\n", " regular_descriptions.append(regular_df.iloc[idx]['说明'])\n", " else:\n", " regular_nodes.append(\"未匹配\")\n", " regular_annotations_matched.append(\"未匹配\")\n", " regular_descriptions.append(\"未匹配\")\n", " \n", " # 确保有3个结果(如果候选项少于3个)\n", " while len(regular_nodes) < 3:\n", " regular_nodes.append(\"未匹配\")\n", " regular_annotations_matched.append(\"未匹配\")\n", " regular_descriptions.append(\"未匹配\")\n", " top3_scores = np.append(top3_scores, 0.0)\n", " \n", " # 添加到结果数据\n", " result_data.append({\n", " 'ParagraphName': paragraph_name,\n", " 'StatementName': statement_name,\n", " 'ValueItemName': value_item_name,\n", " 'DisplayString': display_string,\n", " '规范节点名1': regular_nodes[0],\n", " '规范注释1': regular_annotations_matched[0],\n", " '规范说明1': regular_descriptions[0],\n", " '相似度分数1': top3_scores[0],\n", " '规范节点名2': regular_nodes[1],\n", " '规范注释2': regular_annotations_matched[1],\n", " '规范说明2': regular_descriptions[1],\n", " '相似度分数2': top3_scores[1],\n", " '规范节点名3': regular_nodes[2],\n", " '规范注释3': regular_annotations_matched[2],\n", " '规范说明3': regular_descriptions[2],\n", " '相似度分数3': top3_scores[2]\n", " })\n", " \n", " # 创建结果DataFrame\n", " result_df = pd.DataFrame(result_data)\n", " \n", " # 保存结果\n", " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_transformer_top3_match.csv', \n", " index=False, encoding=test_encoding)\n", " \n", " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n", " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_transformer_top3_match.csv\")\n", " \n", " # 打印前3行匹配结果示例\n", " print(\"\\n前3行匹配结果示例:\")\n", " for i in range(min(3, len(result_df))):\n", " print(f\"原始字段: {combined_fields[i]}\")\n", " print(f\"匹配1: {result_df.iloc[i]['规范注释1']} (分数: {result_df.iloc[i]['相似度分数1']:.4f})\")\n", " print(f\"匹配2: {result_df.iloc[i]['规范注释2']} (分数: {result_df.iloc[i]['相似度分数2']:.4f})\")\n", " print(f\"匹配3: {result_df.iloc[i]['规范注释3']} (分数: {result_df.iloc[i]['相似度分数3']:.4f})\")\n", " print(\"-\" * 50)\n", "else:\n", " print(\"无法读取文件,请手动检查文件编码\")" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "测试文件编码: GB2312\n", "规范文件编码: utf-8\n", "文件成功读取!\n", "初始化TF-IDF向量化器...\n", "开始匹配注释...\n", "处理第 0/127 条记录...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n", " warnings.warn(\n", "Building prefix dict from the default dictionary ...\n", "Dumping model to file cache /tmp/jieba.cache\n", "Loading model cost 0.934 seconds.\n", "Prefix dict has been built successfully.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "处理第 100/127 条记录...\n", "\n", "匹配完成,共处理 127 条记录\n", "结果已保存至: 
/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\n", "\n", "前3行匹配结果示例:\n", "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", "匹配1: 患者姓名 (分数: 0.4840)\n", "匹配2: 患者姓名 (分数: 0.4840)\n", "匹配3: 姓名 (分数: 0.4637)\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", "匹配1: 患者性别 (分数: 0.5414)\n", "匹配2: 性别 (分数: 0.5235)\n", "匹配3: 性别 (分数: 0.5235)\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", "匹配1: 患者年龄 (分数: 0.5357)\n", "匹配2: 年龄 (分数: 0.5170)\n", "匹配3: 年龄 (分数: 0.5170)\n", "--------------------------------------------------\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import chardet\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import jieba\n", "\n", "# 首先检测文件的实际编码\n", "def detect_encoding(file_path):\n", " with open(file_path, 'rb') as f:\n", " result = chardet.detect(f.read())\n", " return result['encoding']\n", "\n", "# 检测文件编码\n", "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n", "\n", "test_encoding = detect_encoding(test_file)\n", "regular_encoding = detect_encoding(regular_file)\n", "\n", "print(f\"测试文件编码: {test_encoding}\")\n", "print(f\"规范文件编码: {regular_encoding}\")\n", "\n", "# 尝试使用检测到的编码读取文件\n", "try:\n", " # 读取规范文件\n", " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", " \n", " # 读取测试数据\n", " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", " \n", " print(\"文件成功读取!\")\n", "except Exception as e:\n", " print(f\"使用检测到的编码读取失败: {e}\")\n", " \n", " # 尝试其他常见编码\n", " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", " \n", " for enc in encodings:\n", " try:\n", " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", " test_df = pd.read_csv(test_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取测试文件\")\n", " \n", " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", " regular_df = pd.read_csv(regular_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取规范文件\")\n", " \n", " test_encoding = enc\n", " regular_encoding = enc\n", " break\n", " except Exception as e:\n", " print(f\"使用 {enc} 读取失败: {e}\")\n", "\n", "# 如果成功读取文件,继续处理\n", "if 'test_df' in locals() and 'regular_df' in locals():\n", " # 创建规范字典,键为注释,值为对应的规则\n", " regular_annotations = regular_df['注释'].tolist()\n", " \n", " # 准备测试数据中的字段组合\n", " combined_fields = []\n", " \n", " for _, row in test_df.iterrows():\n", " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", " combined_fields.append(combined_field)\n", " \n", " # 使用TF-IDF向量化文本\n", " print(\"初始化TF-IDF向量化器...\")\n", " \n", " # 对中文文本进行分词处理\n", " def tokenize_chinese(text):\n", " return list(jieba.cut(text))\n", " \n", " # 初始化TF-IDF向量化器\n", " tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n", " \n", " # 定义TF-IDF相似度计算方法,返回前3个最佳匹配\n", " def calculate_top3_tfidf_similarity(query, candidates, vectorizer):\n", " \"\"\"计算TF-IDF相似度,返回前3个最佳匹配和分数\"\"\"\n", " try:\n", " # 将所有文本合并为一个列表进行向量化\n", " all_texts = [query] + candidates\n", " \n", " # 拟合并转换所有文本\n", " tfidf_matrix = vectorizer.fit_transform(all_texts)\n", " \n", " # 计算查询与所有候选项的余弦相似度\n", " query_vector = tfidf_matrix[0:1]\n", " candidate_vectors = tfidf_matrix[1:]\n", " \n", " cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n", " \n", " # 找到前3个最佳匹配\n", " top3_indices = np.argsort(cosine_scores)[::-1][:3]\n", " top3_scores = cosine_scores[top3_indices]\n", " \n", " return 
top3_indices, top3_scores\n", " except Exception as e:\n", " print(f\"TF-IDF相似度计算失败: {e}\")\n", " return [-1, -1, -1], [0.0, 0.0, 0.0]\n", " \n", " # 创建结果DataFrame\n", " result_data = []\n", " \n", " print(\"开始匹配注释...\")\n", " for i in range(len(test_df)):\n", " if i % 100 == 0:\n", " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n", " \n", " query = combined_fields[i]\n", " \n", " # 使用TF-IDF相似度匹配,获取前3个最佳匹配\n", " top3_indices, top3_scores = calculate_top3_tfidf_similarity(query, regular_annotations, tfidf_vectorizer)\n", " \n", " # 获取测试数据的相关字段\n", " paragraph_name = test_df.iloc[i]['ParagraphName']\n", " statement_name = test_df.iloc[i]['StatementName']\n", " value_item_name = test_df.iloc[i]['ValueItemName']\n", " display_string = test_df.iloc[i]['DisplayString']\n", " \n", " # 获取前3个规范数据的相关字段\n", " regular_nodes = []\n", " regular_annotations_matched = []\n", " regular_descriptions = []\n", " \n", " for idx, score in zip(top3_indices, top3_scores):\n", " if idx >= 0:\n", " regular_nodes.append(regular_df.iloc[idx]['节点名'])\n", " regular_annotations_matched.append(regular_df.iloc[idx]['注释'])\n", " regular_descriptions.append(regular_df.iloc[idx]['说明'])\n", " else:\n", " regular_nodes.append(\"未匹配\")\n", " regular_annotations_matched.append(\"未匹配\")\n", " regular_descriptions.append(\"未匹配\")\n", " \n", " # 确保有3个结果(如果候选项少于3个)\n", " while len(regular_nodes) < 3:\n", " regular_nodes.append(\"未匹配\")\n", " regular_annotations_matched.append(\"未匹配\")\n", " regular_descriptions.append(\"未匹配\")\n", " top3_scores = np.append(top3_scores, 0.0)\n", " \n", " # 添加到结果数据\n", " result_data.append({\n", " 'ParagraphName': paragraph_name,\n", " 'StatementName': statement_name,\n", " 'ValueItemName': value_item_name,\n", " 'DisplayString': display_string,\n", " '规范节点名1': regular_nodes[0],\n", " '规范注释1': regular_annotations_matched[0],\n", " '规范说明1': regular_descriptions[0],\n", " '相似度分数1': top3_scores[0],\n", " '规范节点名2': regular_nodes[1],\n", " '规范注释2': regular_annotations_matched[1],\n", " '规范说明2': regular_descriptions[1],\n", " '相似度分数2': top3_scores[1],\n", " '规范节点名3': regular_nodes[2],\n", " '规范注释3': regular_annotations_matched[2],\n", " '规范说明3': regular_descriptions[2],\n", " '相似度分数3': top3_scores[2]\n", " })\n", " \n", " # 创建结果DataFrame\n", " result_df = pd.DataFrame(result_data)\n", " \n", " # 保存结果\n", " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv', \n", " index=False, encoding=test_encoding)\n", " \n", " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n", " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\")\n", " \n", " # 打印前3行匹配结果示例\n", " print(\"\\n前3行匹配结果示例:\")\n", " for i in range(min(3, len(result_df))):\n", " print(f\"原始字段: {combined_fields[i]}\")\n", " print(f\"匹配1: {result_df.iloc[i]['规范注释1']} (分数: {result_df.iloc[i]['相似度分数1']:.4f})\")\n", " print(f\"匹配2: {result_df.iloc[i]['规范注释2']} (分数: {result_df.iloc[i]['相似度分数2']:.4f})\")\n", " print(f\"匹配3: {result_df.iloc[i]['规范注释3']} (分数: {result_df.iloc[i]['相似度分数3']:.4f})\")\n", " print(\"-\" * 50)\n", "else:\n", " print(\"无法读取文件,请手动检查文件编码\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "测试文件编码: GB2312\n", "规范文件编码: utf-8\n", "文件成功读取!\n", "初始化TF-IDF向量化器...\n", "开始匹配注释...\n", "处理第 0/127 条记录...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' 
will not be used since 'tokenizer' is not None'\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "处理第 100/127 条记录...\n", "\n", "匹配完成,共处理 127 条记录\n", "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_fuzzy_match.csv\n", "\n", "前3行匹配结果示例:\n", "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", "TF-IDF匹配结果:\n", " 匹配1: 患者姓名 (分数: 0.4840)\n", " 匹配2: 患者姓名 (分数: 0.4840)\n", " 匹配3: 姓名 (分数: 0.4637)\n", "FuzzyWuzzy匹配结果:\n", " 匹配1: 患者姓名 (分数: 0.3300)\n", " 匹配2: 入院时情况-患者姓名 (分数: 0.3300)\n", " 匹配3: 患者姓名 (分数: 0.3300)\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", "TF-IDF匹配结果:\n", " 匹配1: 患者性别 (分数: 0.5414)\n", " 匹配2: 性别 (分数: 0.5235)\n", " 匹配3: 性别 (分数: 0.5235)\n", "FuzzyWuzzy匹配结果:\n", " 匹配1: 患者姓名 (分数: 0.3600)\n", " 匹配2: 入院时情况-患者姓名 (分数: 0.3600)\n", " 匹配3: 患者姓名 (分数: 0.3600)\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", "TF-IDF匹配结果:\n", " 匹配1: 患者年龄 (分数: 0.5357)\n", " 匹配2: 年龄 (分数: 0.5170)\n", " 匹配3: 年龄 (分数: 0.5170)\n", "FuzzyWuzzy匹配结果:\n", " 匹配1: 患者姓名 (分数: 0.3600)\n", " 匹配2: 入院时情况-患者姓名 (分数: 0.3600)\n", " 匹配3: 患者姓名 (分数: 0.3600)\n", "--------------------------------------------------\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import chardet\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import jieba\n", "from fuzzywuzzy import fuzz\n", "from fuzzywuzzy import process\n", "\n", "# 首先检测文件的实际编码\n", "def detect_encoding(file_path):\n", " with open(file_path, 'rb') as f:\n", " result = chardet.detect(f.read())\n", " return result['encoding']\n", "\n", "# 检测文件编码\n", "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n", "\n", "test_encoding = detect_encoding(test_file)\n", "regular_encoding = detect_encoding(regular_file)\n", "\n", "print(f\"测试文件编码: {test_encoding}\")\n", "print(f\"规范文件编码: {regular_encoding}\")\n", "\n", "# 尝试使用检测到的编码读取文件\n", "try:\n", " # 读取规范文件\n", " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", " \n", " # 读取测试数据\n", " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", " \n", " print(\"文件成功读取!\")\n", "except Exception as e:\n", " print(f\"使用检测到的编码读取失败: {e}\")\n", " \n", " # 尝试其他常见编码\n", " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", " \n", " for enc in encodings:\n", " try:\n", " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", " test_df = pd.read_csv(test_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取测试文件\")\n", " \n", " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", " regular_df = pd.read_csv(regular_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取规范文件\")\n", " \n", " test_encoding = enc\n", " regular_encoding = enc\n", " break\n", " except Exception as e:\n", " print(f\"使用 {enc} 读取失败: {e}\")\n", "\n", "# 如果成功读取文件,继续处理\n", "if 'test_df' in locals() and 'regular_df' in locals():\n", " # 创建规范字典,键为注释,值为对应的规则\n", " regular_annotations = regular_df['注释'].tolist()\n", " \n", " # 准备测试数据中的字段组合\n", " combined_fields = []\n", " \n", " for _, row in test_df.iterrows():\n", " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", " combined_fields.append(combined_field)\n", " \n", " # 使用TF-IDF向量化文本\n", " print(\"初始化TF-IDF向量化器...\")\n", " \n", " # 对中文文本进行分词处理\n", " def tokenize_chinese(text):\n", " return list(jieba.cut(text))\n", " \n", " # 初始化TF-IDF向量化器\n", " tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n", " 
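\n",
"    # 说明:下面的 calculate_tfidf_similarity 每次查询都会重新 fit TF-IDF 向量化器(实现简单但偏慢,语料大时可先统一 fit 再逐条 transform);\n",
"    # calculate_fuzzy_similarity 则通过 candidates.index() 反查索引,当注释文本重复时只会取到首次出现的位置,对应的节点名未必是真正命中的那一行。\n",
"    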
\n", " # 定义TF-IDF相似度计算方法,返回最佳匹配(最多3个)\n", " def calculate_tfidf_similarity(query, candidates, vectorizer, max_matches=3, threshold=0.1):\n", " \"\"\"计算TF-IDF相似度,返回最佳匹配(最多max_matches个)\"\"\"\n", " try:\n", " # 将所有文本合并为一个列表进行向量化\n", " all_texts = [query] + candidates\n", " \n", " # 拟合并转换所有文本\n", " tfidf_matrix = vectorizer.fit_transform(all_texts)\n", " \n", " # 计算查询与所有候选项的余弦相似度\n", " query_vector = tfidf_matrix[0:1]\n", " candidate_vectors = tfidf_matrix[1:]\n", " \n", " cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n", " \n", " # 找到相似度大于阈值的匹配\n", " valid_indices = np.where(cosine_scores > threshold)[0]\n", " \n", " # 按相似度降序排序\n", " sorted_indices = valid_indices[np.argsort(cosine_scores[valid_indices])[::-1]]\n", " \n", " # 最多取max_matches个\n", " top_indices = sorted_indices[:max_matches]\n", " top_scores = cosine_scores[top_indices]\n", " \n", " return top_indices, top_scores\n", " except Exception as e:\n", " print(f\"TF-IDF相似度计算失败: {e}\")\n", " return np.array([]), np.array([])\n", " \n", " # 定义FuzzyWuzzy相似度计算方法,返回最佳匹配(最多3个)\n", " def calculate_fuzzy_similarity(query, candidates, max_matches=3):\n", " \"\"\"计算FuzzyWuzzy相似度,返回最佳匹配(最多max_matches个)\"\"\"\n", " try:\n", " # 使用process.extract获取最佳匹配\n", " matches = process.extract(query, candidates, limit=max_matches, scorer=fuzz.token_sort_ratio)\n", " \n", " # 提取索引和分数\n", " indices = []\n", " scores = []\n", " \n", " for match in matches:\n", " # match格式为(匹配文本, 分数)\n", " matched_text, score = match\n", " # 找到匹配文本在原始列表中的索引\n", " idx = candidates.index(matched_text)\n", " indices.append(idx)\n", " scores.append(score / 100.0) # 将分数归一化到0-1范围\n", " \n", " return np.array(indices), np.array(scores)\n", " except Exception as e:\n", " print(f\"FuzzyWuzzy相似度计算失败: {e}\")\n", " return np.array([]), np.array([])\n", " \n", " # 创建结果DataFrame\n", " result_data = []\n", " \n", " print(\"开始匹配注释...\")\n", " \n", " for i in range(len(test_df)):\n", " if i % 100 == 0:\n", " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n", " \n", " query = combined_fields[i]\n", " \n", " # 使用TF-IDF相似度匹配,获取最佳匹配(最多3个)\n", " tfidf_indices, tfidf_scores = calculate_tfidf_similarity(query, regular_annotations, tfidf_vectorizer)\n", " \n", " # 使用FuzzyWuzzy相似度匹配,获取最佳匹配(最多3个)\n", " fuzzy_indices, fuzzy_scores = calculate_fuzzy_similarity(query, regular_annotations)\n", " \n", " # 获取测试数据的相关字段\n", " paragraph_name = test_df.iloc[i]['ParagraphName']\n", " statement_name = test_df.iloc[i]['StatementName']\n", " value_item_name = test_df.iloc[i]['ValueItemName']\n", " display_string = test_df.iloc[i]['DisplayString']\n", " \n", " # 创建结果字典\n", " result_dict = {\n", " 'ParagraphName': paragraph_name,\n", " 'StatementName': statement_name,\n", " 'ValueItemName': value_item_name,\n", " 'DisplayString': display_string\n", " }\n", " \n", " # 添加TF-IDF匹配结果\n", " for j in range(min(3, len(tfidf_indices))):\n", " idx = tfidf_indices[j]\n", " score = tfidf_scores[j]\n", " result_dict[f'TFIDF_规范节点名{j+1}'] = regular_df.iloc[idx]['节点名']\n", " result_dict[f'TFIDF_规范注释{j+1}'] = regular_df.iloc[idx]['注释']\n", " result_dict[f'TFIDF_规范说明{j+1}'] = regular_df.iloc[idx]['说明']\n", " result_dict[f'TFIDF_相似度分数{j+1}'] = score\n", " \n", " # 添加FuzzyWuzzy匹配结果\n", " for j in range(min(3, len(fuzzy_indices))):\n", " idx = fuzzy_indices[j]\n", " score = fuzzy_scores[j]\n", " result_dict[f'Fuzzy_规范节点名{j+1}'] = regular_df.iloc[idx]['节点名']\n", " result_dict[f'Fuzzy_规范注释{j+1}'] = regular_df.iloc[idx]['注释']\n", " result_dict[f'Fuzzy_规范说明{j+1}'] = regular_df.iloc[idx]['说明']\n", " 
result_dict[f'Fuzzy_相似度分数{j+1}'] = score\n", " \n", " # 添加到结果数据\n", " result_data.append(result_dict)\n", " \n", " # 创建结果DataFrame\n", " result_df = pd.DataFrame(result_data)\n", " \n", " # 保存结果\n", " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_fuzzy_match.csv', \n", " index=False, encoding=test_encoding)\n", " \n", " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n", " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_fuzzy_match.csv\")\n", " \n", " # 打印前3行匹配结果示例\n", " print(\"\\n前3行匹配结果示例:\")\n", " for i in range(min(3, len(result_df))):\n", " print(f\"原始字段: {combined_fields[i]}\")\n", " \n", " print(\"TF-IDF匹配结果:\")\n", " for j in range(1, 4):\n", " if f'TFIDF_规范注释{j}' in result_df.columns and not pd.isna(result_df.iloc[i].get(f'TFIDF_规范注释{j}', None)):\n", " print(f\" 匹配{j}: {result_df.iloc[i][f'TFIDF_规范注释{j}']} (分数: {result_df.iloc[i][f'TFIDF_相似度分数{j}']:.4f})\")\n", " \n", " print(\"FuzzyWuzzy匹配结果:\")\n", " for j in range(1, 4):\n", " if f'Fuzzy_规范注释{j}' in result_df.columns and not pd.isna(result_df.iloc[i].get(f'Fuzzy_规范注释{j}', None)):\n", " print(f\" 匹配{j}: {result_df.iloc[i][f'Fuzzy_规范注释{j}']} (分数: {result_df.iloc[i][f'Fuzzy_相似度分数{j}']:.4f})\")\n", " \n", " print(\"-\" * 50)\n", "else:\n", " print(\"无法读取文件,请手动检查文件编码\")" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n", " warnings.warn(\n", "Building prefix dict from the default dictionary ...\n", "Loading model from cache /tmp/jieba.cache\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "测试文件编码: GB2312\n", "规范文件编码: utf-8\n", "文件成功读取!\n", "初始化TF-IDF向量化器...\n", "开始匹配注释...\n", "处理第 0/127 条记录...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Loading model cost 0.942 seconds.\n", "Prefix dict has been built successfully.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "处理第 100/127 条记录...\n", "\n", "匹配完成,共处理 127 条记录\n", "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv\n", "\n", "前3行匹配结果示例:\n", "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", "最佳匹配: 患者姓名 (分数: 0.4840)\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", "最佳匹配: 患者性别 (分数: 0.5414)\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", "最佳匹配: 患者年龄 (分数: 0.5357)\n", "--------------------------------------------------\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import chardet\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import jieba\n", "\n", "# 首先检测文件的实际编码\n", "def detect_encoding(file_path):\n", " with open(file_path, 'rb') as f:\n", " result = chardet.detect(f.read())\n", " return result['encoding']\n", "\n", "# 检测文件编码\n", "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n", "\n", "test_encoding = detect_encoding(test_file)\n", "regular_encoding = detect_encoding(regular_file)\n", "\n", "print(f\"测试文件编码: {test_encoding}\")\n", "print(f\"规范文件编码: {regular_encoding}\")\n", "\n", "# 尝试使用检测到的编码读取文件\n", "try:\n", " # 读取规范文件\n", " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", " \n", " # 读取测试数据\n", " test_df = pd.read_csv(test_file, 
encoding=test_encoding)\n", " \n", " print(\"文件成功读取!\")\n", "except Exception as e:\n", " print(f\"使用检测到的编码读取失败: {e}\")\n", " \n", " # 尝试其他常见编码\n", " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", " \n", " for enc in encodings:\n", " try:\n", " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", " test_df = pd.read_csv(test_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取测试文件\")\n", " \n", " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", " regular_df = pd.read_csv(regular_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取规范文件\")\n", " \n", " test_encoding = enc\n", " regular_encoding = enc\n", " break\n", " except Exception as e:\n", " print(f\"使用 {enc} 读取失败: {e}\")\n", "\n", "# 如果成功读取文件,继续处理\n", "if 'test_df' in locals() and 'regular_df' in locals():\n", " # 创建规范字典,键为注释,值为对应的规则\n", " regular_annotations = regular_df['注释'].tolist()\n", " \n", " # 准备测试数据中的字段组合\n", " combined_fields = []\n", " \n", " for _, row in test_df.iterrows():\n", " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", " combined_fields.append(combined_field)\n", " \n", " # 使用TF-IDF向量化文本\n", " print(\"初始化TF-IDF向量化器...\")\n", " \n", " # 对中文文本进行分词处理\n", " def tokenize_chinese(text):\n", " return list(jieba.cut(text))\n", " \n", " # 初始化TF-IDF向量化器\n", " tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n", " \n", " # 定义TF-IDF相似度计算方法,只返回最佳匹配(分数最高的)\n", " def calculate_best_tfidf_match(query, candidates, vectorizer):\n", " \"\"\"计算TF-IDF相似度,只返回最佳匹配(分数最高的)\"\"\"\n", " try:\n", " # 将所有文本合并为一个列表进行向量化\n", " all_texts = [query] + candidates\n", " \n", " # 拟合并转换所有文本\n", " tfidf_matrix = vectorizer.fit_transform(all_texts)\n", " \n", " # 计算查询与所有候选项的余弦相似度\n", " query_vector = tfidf_matrix[0:1]\n", " candidate_vectors = tfidf_matrix[1:]\n", " \n", " cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n", " \n", " # 找到分数最高的匹配\n", " best_index = np.argmax(cosine_scores)\n", " best_score = cosine_scores[best_index]\n", " \n", " return best_index, best_score\n", " except Exception as e:\n", " print(f\"TF-IDF相似度计算失败: {e}\")\n", " return -1, 0.0\n", " \n", " # 创建结果DataFrame\n", " result_data = []\n", " \n", " print(\"开始匹配注释...\")\n", " \n", " for i in range(len(test_df)):\n", " if i % 100 == 0:\n", " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n", " \n", " query = combined_fields[i]\n", " \n", " # 使用TF-IDF相似度匹配,只获取最佳匹配\n", " best_index, best_score = calculate_best_tfidf_match(query, regular_annotations, tfidf_vectorizer)\n", " \n", " # 获取测试数据的相关字段\n", " row = test_df.iloc[i]\n", " \n", " # 创建结果字典,包含原始字段\n", " result_dict = {\n", " 'ParagraphName': row['ParagraphName'],\n", " 'StatementName': row['StatementName'],\n", " 'ValueItemName': row['ValueItemName'],\n", " 'DisplayString': row['DisplayString']\n", " }\n", " \n", " # 添加SFZH, XGRQ, IPBLH字段(如果存在)\n", " if 'SFZH' in test_df.columns:\n", " result_dict['SFZH'] = row['SFZH']\n", " if 'XGRQ' in test_df.columns:\n", " result_dict['XGRQ'] = row['XGRQ']\n", " if 'IPBLH' in test_df.columns:\n", " result_dict['IPBLH'] = row['IPBLH']\n", " \n", " # 添加最佳TF-IDF匹配结果\n", " if best_index >= 0:\n", " result_dict['TFIDF_规范节点名'] = regular_df.iloc[best_index]['节点名']\n", " result_dict['TFIDF_规范注释'] = regular_df.iloc[best_index]['注释']\n", " result_dict['TFIDF_规范说明'] = regular_df.iloc[best_index]['说明']\n", " result_dict['TFIDF_相似度分数'] = best_score\n", " else:\n", " result_dict['TFIDF_规范节点名'] = ''\n", " result_dict['TFIDF_规范注释'] = ''\n", " result_dict['TFIDF_规范说明'] = ''\n", " result_dict['TFIDF_相似度分数'] = 0.0\n", " \n", " # 
添加到结果数据\n", "        result_data.append(result_dict)\n", "    \n", "    # 创建结果DataFrame\n", "    result_df = pd.DataFrame(result_data)\n", "    \n", "    # 保存结果\n", "    result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv', \n", "                     index=False, encoding=test_encoding)\n", "    \n", "    print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n", "    print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv\")\n", "    \n", "    # 打印前3行匹配结果示例\n", "    print(\"\\n前3行匹配结果示例:\")\n", "    for i in range(min(3, len(result_df))):\n", "        print(f\"原始字段: {combined_fields[i]}\")\n", "        print(f\"最佳匹配: {result_df.iloc[i]['TFIDF_规范注释']} (分数: {result_df.iloc[i]['TFIDF_相似度分数']:.4f})\")\n", "        print(\"-\" * 50)\n", "else:\n", "    print(\"无法读取文件,请手动检查文件编码\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] },
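{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 补充示例(未运行):上面各单元都重复了同一段“检测编码→读取失败→按备选编码回退”的读取逻辑,\n", "# 这里给出一个收拢成辅助函数的草稿;函数名 read_csv_with_fallback 仅为示意,并非上文已有接口。\n", "import pandas as pd\n", "import chardet\n", "\n", "def read_csv_with_fallback(file_path, fallback_encodings=('gbk', 'gb18030', 'latin1', 'cp936', 'big5')):\n", "    \"\"\"先用chardet检测编码读取CSV,失败时依次尝试备选编码,返回(DataFrame, 实际使用的编码)\"\"\"\n", "    with open(file_path, 'rb') as f:\n", "        detected = chardet.detect(f.read())['encoding']\n", "    for enc in [detected, *fallback_encodings]:\n", "        if enc is None:\n", "            continue\n", "        try:\n", "            return pd.read_csv(file_path, encoding=enc), enc\n", "        except Exception as e:\n", "            # 与上文各单元一致,读取失败时打印原因并尝试下一种编码\n", "            print(f\"使用 {enc} 读取失败: {e}\")\n", "    raise ValueError(f\"无法读取文件,请手动检查文件编码: {file_path}\")\n", "\n", "# 用法示例(路径沿用上文):\n", "# test_df, test_encoding = read_csv_with_fallback('/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv')\n", "# regular_df, regular_encoding = read_csv_with_fallback('/home/limeng/SICT/lung_test/data/regular.csv')" ] },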
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "测试文件编码: GB2312\n", "规范文件编码: utf-8\n", "文件成功读取!\n", "\n", "测试文件的列名:\n", "['Id', 'PatientName', 'IPBLH', 'OPBLH', 'KH', 'KLX', 'SexId', 'CSRQ', 'ZJLX', 'SFZH', 'HYZK', 'ZYDM', 'GJDM', 'MZDM', 'JZDZ', 'YB', 'JG', 'HKDZ', 'DHHM', 'SJHM', 'ABOBloodTypeId', 'LXRXM', 'LXRGX', 'LXRDH', 'ZLLB', 'ZLMC', 'XGRQ', 'YJLXH', 'RYSJ', 'FolderName', 'Xh', 'RecordXh', 'FolderId', 'DocumentName', 'InstanceId', 'DocumentId', 'ParagraphId', 'ParagraphName', 'StatementId', 'StatementName', 'ValueId', 'ValueItemName', 'ValueItemKind', 'RealValue', 'ValueString', 'DisplayString', 'ValuePostfix', 'WSJLSCSJ', 'WSJLXGSJ', 'upload_time']\n", "\n", "初始化TF-IDF向量化器...\n", "开始匹配注释...\n", "处理第 0/127 条记录...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n", "  warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "处理第 100/127 条记录...\n", "\n", "匹配完成,共处理 127 条记录\n", "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_完整字段_tfidf_match.csv\n", "\n", "结果文件的列名:\n", "['Id', 'PatientName', 'IPBLH', 'OPBLH', 'KH', 'KLX', 'SexId', 'CSRQ', 'ZJLX', 'SFZH', 'HYZK', 'ZYDM', 'GJDM', 'MZDM', 'JZDZ', 'YB', 'JG', 'HKDZ', 'DHHM', 'SJHM', 'ABOBloodTypeId', 'LXRXM', 'LXRGX', 'LXRDH', 'ZLLB', 'ZLMC', 'XGRQ', 'YJLXH', 'RYSJ', 'FolderName', 'Xh', 'RecordXh', 'FolderId', 'DocumentName', 'InstanceId', 'DocumentId', 'ParagraphId', 'ParagraphName', 'StatementId', 'StatementName', 'ValueId', 'ValueItemName', 'ValueItemKind', 'RealValue', 'ValueString', 'DisplayString', 'ValuePostfix', 'WSJLSCSJ', 'WSJLXGSJ', 'upload_time', '规范节点名', '规范注释', '规范说明', 'processed_string']\n", "\n", "前3行匹配结果示例:\n", "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n", "最佳匹配: 患者姓名\n", "处理后字符串: 测试\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-性别\n", "最佳匹配: 患者性别\n", "处理后字符串: 女\n", "--------------------------------------------------\n", "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n", "最佳匹配: 患者年龄\n", "处理后字符串: 22岁\n", "--------------------------------------------------\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import chardet\n", "from 
sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import jieba\n", "\n", "# 首先检测文件的实际编码\n", "def detect_encoding(file_path):\n", " with open(file_path, 'rb') as f:\n", " result = chardet.detect(f.read())\n", " return result['encoding']\n", "\n", "# 检测文件编码\n", "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n", "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n", "\n", "test_encoding = detect_encoding(test_file)\n", "regular_encoding = detect_encoding(regular_file)\n", "\n", "print(f\"测试文件编码: {test_encoding}\")\n", "print(f\"规范文件编码: {regular_encoding}\")\n", "\n", "# 尝试使用检测到的编码读取文件\n", "try:\n", " # 读取规范文件\n", " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n", " \n", " # 读取测试数据\n", " test_df = pd.read_csv(test_file, encoding=test_encoding)\n", " \n", " print(\"文件成功读取!\")\n", "except Exception as e:\n", " print(f\"使用检测到的编码读取失败: {e}\")\n", " \n", " # 尝试其他常见编码\n", " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n", " \n", " for enc in encodings:\n", " try:\n", " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n", " test_df = pd.read_csv(test_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取测试文件\")\n", " \n", " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n", " regular_df = pd.read_csv(regular_file, encoding=enc)\n", " print(f\"成功使用 {enc} 读取规范文件\")\n", " \n", " test_encoding = enc\n", " regular_encoding = enc\n", " break\n", " except Exception as e:\n", " print(f\"使用 {enc} 读取失败: {e}\")\n", "\n", "# 如果成功读取文件,继续处理\n", "if 'test_df' in locals() and 'regular_df' in locals():\n", " # 打印测试文件的列名,以供参考\n", " print(\"\\n测试文件的列名:\")\n", " print(test_df.columns.tolist())\n", " \n", " # 创建规范字典,键为注释,值为对应的规则\n", " regular_annotations = regular_df['注释'].tolist()\n", " \n", " # 准备测试数据中的字段组合\n", " combined_fields = []\n", " \n", " for _, row in test_df.iterrows():\n", " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n", " combined_fields.append(combined_field)\n", " \n", " # 使用TF-IDF向量化文本\n", " print(\"\\n初始化TF-IDF向量化器...\")\n", " \n", " # 对中文文本进行分词处理\n", " def tokenize_chinese(text):\n", " return list(jieba.cut(text))\n", " \n", " # 初始化TF-IDF向量化器\n", " tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n", " \n", " # 定义TF-IDF相似度计算方法,只返回最佳匹配(分数最高的)\n", " def calculate_best_tfidf_match(query, candidates, vectorizer):\n", " \"\"\"计算TF-IDF相似度,只返回最佳匹配(分数最高的)\"\"\"\n", " try:\n", " # 将所有文本合并为一个列表进行向量化\n", " all_texts = [query] + candidates\n", " \n", " # 拟合并转换所有文本\n", " tfidf_matrix = vectorizer.fit_transform(all_texts)\n", " \n", " # 计算查询与所有候选项的余弦相似度\n", " query_vector = tfidf_matrix[0:1]\n", " candidate_vectors = tfidf_matrix[1:]\n", " \n", " cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n", " \n", " # 找到分数最高的匹配\n", " best_index = np.argmax(cosine_scores)\n", " best_score = cosine_scores[best_index]\n", " \n", " return best_index, best_score\n", " except Exception as e:\n", " print(f\"TF-IDF相似度计算失败: {e}\")\n", " return -1, 0.0\n", " \n", " # 创建结果DataFrame\n", " result_data = []\n", " \n", " print(\"开始匹配注释...\")\n", " \n", " for i in range(len(test_df)):\n", " if i % 100 == 0:\n", " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n", " \n", " query = combined_fields[i]\n", " \n", " # 使用TF-IDF相似度匹配,只获取最佳匹配\n", " best_index, best_score = calculate_best_tfidf_match(query, regular_annotations, tfidf_vectorizer)\n", " \n", " # 获取测试数据的行\n", " row = test_df.iloc[i]\n", " \n", " # 创建结果字典,包含测试数据的所有字段\n", " result_dict = 
row.to_dict()\n", "        \n", "        # 添加最佳TF-IDF匹配结果\n", "        if best_index >= 0:\n", "            result_dict['规范节点名'] = regular_df.iloc[best_index]['节点名']\n", "            result_dict['规范注释'] = regular_df.iloc[best_index]['注释']\n", "            result_dict['规范说明'] = regular_df.iloc[best_index]['说明']\n", "            \n", "            # processed_string 目前直接取自 DisplayString;如需对显示文本做清洗或标准化,可在此处添加处理逻辑\n", "            result_dict['processed_string'] = row['DisplayString']\n", "        else:\n", "            result_dict['规范节点名'] = ''\n", "            result_dict['规范注释'] = ''\n", "            result_dict['规范说明'] = ''\n", "            result_dict['processed_string'] = ''\n", "        \n", "        # 添加到结果数据\n", "        result_data.append(result_dict)\n", "    \n", "    # 创建结果DataFrame\n", "    result_df = pd.DataFrame(result_data)\n", "    \n", "    # 重新排列列顺序,将匹配结果放在后面\n", "    all_columns = test_df.columns.tolist() + ['规范节点名', '规范注释', '规范说明', 'processed_string']\n", "    result_df = result_df[all_columns]\n", "    \n", "    # 保存结果\n", "    result_file = '/home/limeng/SICT/lung_test/result/喉癌患者测试样例_完整字段_tfidf_match.csv'\n", "    result_df.to_csv(result_file, index=False, encoding=test_encoding)\n", "    \n", "    print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n", "    print(f\"结果已保存至: {result_file}\")\n", "    \n", "    # 打印结果DataFrame的列名\n", "    print(\"\\n结果文件的列名:\")\n", "    print(result_df.columns.tolist())\n", "    \n", "    # 打印前3行匹配结果示例\n", "    print(\"\\n前3行匹配结果示例:\")\n", "    for i in range(min(3, len(result_df))):\n", "        print(f\"原始字段: {combined_fields[i]}\")\n", "        print(f\"最佳匹配: {result_df.iloc[i]['规范注释']}\")\n", "        print(f\"处理后字符串: {result_df.iloc[i]['processed_string']}\")\n", "        print(\"-\" * 50)\n", "else:\n", "    print(\"无法读取文件,请手动检查文件编码\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", "  from .autonotebook import tqdm as notebook_tqdm\n", "2025-03-10 09:55:11,393\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 03-10 09:55:19 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.\n", "WARNING 03-10 09:55:19 config.py:428] gptq quantization is not fully optimized yet. 
The speed can be slower than non-quantized models.\n", "INFO 03-10 09:55:19 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', speculative_config=None, tokenizer='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, chat_template_text_format=string, mm_processor_kwargs=None, pooler_config=None)\n", "INFO 03-10 09:55:20 selector.py:261] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.\n", "INFO 03-10 09:55:20 selector.py:144] Using XFormers backend.\n", "INFO 03-10 09:55:20 model_runner.py:1072] Starting to load model /opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[W310 09:55:20.487166102 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())\n", "Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00