From de2f24da4bf2ff915b2cfa14ed4ace6965ffcb49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E7=9B=9F?= <1127928805@qq.com>
Date: Thu, 27 Mar 2025 09:47:59 +0000
Subject: [PATCH] Upload New File
---
xslx2csv.ipynb | 3591 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 3591 insertions(+)
create mode 100644 xslx2csv.ipynb
diff --git a/xslx2csv.ipynb b/xslx2csv.ipynb
new file mode 100644
index 0000000..b130007
--- /dev/null
+++ b/xslx2csv.ipynb
@@ -0,0 +1,3591 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 节点名 | \n",
+ " 表名 | \n",
+ " 节点类型 | \n",
+ " 宽度 | \n",
+ " 是否必传 | \n",
+ " 注释 | \n",
+ " 说明 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " COC_HBZL_RYJL$kh | \n",
+ " 入院记录 | \n",
+ " 字符 | \n",
+ " 32 | \n",
+ " Y | \n",
+ " 卡号 | \n",
+ " 患者就诊卡卡号 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " COC_HBZL_RYJL$klx | \n",
+ " 入院记录 | \n",
+ " 字符 | \n",
+ " 16 | \n",
+ " Y | \n",
+ " 卡类型 | \n",
+ " 参见字典表 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " COC_HBZL_RYJL$xgbz | \n",
+ " 入院记录 | \n",
+ " 字符 | \n",
+ " 1 | \n",
+ " Y | \n",
+ " 修改标志 | \n",
+ " 1:正常 2:修改3:撤销 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " COC_HBZL_RYJL$yjlxh | \n",
+ " 入院记录 | \n",
+ " 字符 | \n",
+ " 32 | \n",
+ " Y | \n",
+ " 原纪录序号 | \n",
+ " 院内唯一标识 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " COC_HBZL_RYJL$hzbh | \n",
+ " 入院记录 | \n",
+ " varchar | \n",
+ " 64 | \n",
+ " Y | \n",
+ " 患者编号 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1137 | \n",
+ " COC_HBZL_SFJL$hgnpg | \n",
+ " 随访记录 | \n",
+ " varchar | \n",
+ " 500 | \n",
+ " NaN | \n",
+ " 喉功能评估 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1138 | \n",
+ " COC_HBZL_SFJL$zhzlsj | \n",
+ " 随访记录 | \n",
+ " varchar | \n",
+ " 20 | \n",
+ " NaN | \n",
+ " 综合治疗时间 | \n",
+ " 【天/月/年】后 | \n",
+ "
\n",
+ " \n",
+ " 1139 | \n",
+ " COC_HBZL_SFJL$zhzlfa | \n",
+ " 随访记录 | \n",
+ " varchar | \n",
+ " 500 | \n",
+ " NaN | \n",
+ " 综合治疗方案 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1140 | \n",
+ " COC_HBZL_SFJL$sxz | \n",
+ " 随访记录 | \n",
+ " varchar | \n",
+ " 50 | \n",
+ " NaN | \n",
+ " 书写者 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1141 | \n",
+ " COC_HBZL_SFJL$cjsj | \n",
+ " 随访记录 | \n",
+ " varchar | \n",
+ " 20 | \n",
+ " NaN | \n",
+ " 创建时间 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1142 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 节点名 表名 节点类型 宽度 是否必传 注释 说明\n",
+ "0 COC_HBZL_RYJL$kh 入院记录 字符 32 Y 卡号 患者就诊卡卡号\n",
+ "1 COC_HBZL_RYJL$klx 入院记录 字符 16 Y 卡类型 参见字典表\n",
+ "2 COC_HBZL_RYJL$xgbz 入院记录 字符 1 Y 修改标志 1:正常 2:修改3:撤销\n",
+ "3 COC_HBZL_RYJL$yjlxh 入院记录 字符 32 Y 原纪录序号 院内唯一标识\n",
+ "4 COC_HBZL_RYJL$hzbh 入院记录 varchar 64 Y 患者编号 NaN\n",
+ "... ... ... ... ... ... ... ...\n",
+ "1137 COC_HBZL_SFJL$hgnpg 随访记录 varchar 500 NaN 喉功能评估 NaN\n",
+ "1138 COC_HBZL_SFJL$zhzlsj 随访记录 varchar 20 NaN 综合治疗时间 【天/月/年】后\n",
+ "1139 COC_HBZL_SFJL$zhzlfa 随访记录 varchar 500 NaN 综合治疗方案 NaN\n",
+ "1140 COC_HBZL_SFJL$sxz 随访记录 varchar 50 NaN 书写者 NaN\n",
+ "1141 COC_HBZL_SFJL$cjsj 随访记录 varchar 20 NaN 创建时间 NaN\n",
+ "\n",
+ "[1142 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.read_excel('/home/limeng/SICT/lung_test/数据采集接口规范(喉癌).xlsx')\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.to_csv('regular.csv',index = False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "匹配结果统计:\n",
+ "未匹配 127\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import difflib\n",
+ "\n",
+ "def get_best_match(target, choices):\n",
+ " \"\"\"使用difflib找到最佳匹配\"\"\"\n",
+ " matches = difflib.get_close_matches(target, choices, n=1, cutoff=0.6)\n",
+ " return matches[0] if matches else None\n",
+ "\n",
+ "# 读取规范文件\n",
+ "regular_df = pd.read_csv('/home/limeng/SICT/lung_test/regular.csv')\n",
+ "\n",
+ "# 读取测试数据\n",
+ "test_df = pd.read_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例.csv', encoding='ISO-8859-1')\n",
+ "\n",
+ "# 创建规范字典,键为注释,值为对应的规则\n",
+ "regular_dict = dict(zip(regular_df['注释'], regular_df.to_dict('records')))\n",
+ "\n",
+ "# 创建新的注释列\n",
+ "matched_annotations = []\n",
+ "for _, row in test_df.iterrows():\n",
+ " # 组合三个字段\n",
+ " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
+ " \n",
+ " # 在规范中查找最佳匹配\n",
+ " best_match = get_best_match(combined_field, regular_dict.keys())\n",
+ " matched_annotations.append(best_match if best_match else \"未匹配\")\n",
+ "\n",
+ "# 获取ValueItemKind列的位置\n",
+ "kind_idx = test_df.columns.get_loc('ValueItemKind')\n",
+ "\n",
+ "# 在ValueItemKind列前插入新的注释列\n",
+ "test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n",
+ "\n",
+ "# 保存结果\n",
+ "test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations.csv', index=False)\n",
+ "\n",
+ "# 打印匹配结果统计\n",
+ "print(\"\\n匹配结果统计:\")\n",
+ "print(pd.Series(matched_annotations).value_counts())"
+ ]
+ },
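+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`difflib.get_close_matches` ranks candidates by `SequenceMatcher.ratio()` (2*M/T, where M is the number of matching characters and T the combined length) and drops anything below `cutoff`. A minimal sketch of the behaviour the cell above depends on — the strings here are made-up illustrations, not rows from the spec file:\n",
+ "\n",
+ "```python\n",
+ "import difflib\n",
+ "\n",
+ "choices = ['患者姓名', '患者年龄', '主诉']\n",
+ "# A close variant clears the 0.6 cutoff and is returned\n",
+ "print(difflib.get_close_matches('患者姓', choices, n=1, cutoff=0.6)) # ['患者姓名']\n",
+ "# Mojibake shares no characters with any choice, so nothing survives the cutoff\n",
+ "print(difflib.get_close_matches('»¼ÕßÐÕÃû', choices, n=1, cutoff=0.6)) # []\n",
+ "```"
+ ]
+ },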
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "测试文件编码: GB2312\n",
+ "规范文件编码: utf-8\n",
+ "文件成功读取!\n",
+ "\n",
+ "匹配结果统计:\n",
+ "未匹配 123\n",
+ "现病史-精神状态 1\n",
+ "体格检查-精神状态 1\n",
+ "体格检查-呼吸 1\n",
+ "体格检查-查体 1\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import difflib\n",
+ "import chardet\n",
+ "\n",
+ "# 首先检测文件的实际编码\n",
+ "def detect_encoding(file_path):\n",
+ " with open(file_path, 'rb') as f:\n",
+ " result = chardet.detect(f.read())\n",
+ " return result['encoding']\n",
+ "\n",
+ "# 检测文件编码\n",
+ "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n",
+ "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n",
+ "\n",
+ "test_encoding = detect_encoding(test_file)\n",
+ "regular_encoding = detect_encoding(regular_file)\n",
+ "\n",
+ "print(f\"测试文件编码: {test_encoding}\")\n",
+ "print(f\"规范文件编码: {regular_encoding}\")\n",
+ "\n",
+ "# 尝试使用检测到的编码读取文件\n",
+ "try:\n",
+ " # 读取规范文件\n",
+ " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
+ " \n",
+ " # 读取测试数据\n",
+ " test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
+ " \n",
+ " print(\"文件成功读取!\")\n",
+ "except Exception as e:\n",
+ " print(f\"使用检测到的编码读取失败: {e}\")\n",
+ " \n",
+ " # 尝试其他常见编码\n",
+ " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
+ " \n",
+ " for enc in encodings:\n",
+ " try:\n",
+ " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
+ " test_df = pd.read_csv(test_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取测试文件\")\n",
+ " \n",
+ " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
+ " regular_df = pd.read_csv(regular_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取规范文件\")\n",
+ " \n",
+ " test_encoding = enc\n",
+ " regular_encoding = enc\n",
+ " break\n",
+ " except Exception as e:\n",
+ " print(f\"使用 {enc} 读取失败: {e}\")\n",
+ "\n",
+ "# 如果成功读取文件,继续处理\n",
+ "if 'test_df' in locals() and 'regular_df' in locals():\n",
+ " def get_best_match(target, choices):\n",
+ " \"\"\"使用difflib找到最佳匹配\"\"\"\n",
+ " matches = difflib.get_close_matches(target, choices, n=1, cutoff=0.6)\n",
+ " return matches[0] if matches else None\n",
+ "\n",
+ " # 创建规范字典,键为注释,值为对应的规则\n",
+ " regular_dict = dict(zip(regular_df['注释'], regular_df.to_dict('records')))\n",
+ "\n",
+ " # 创建新的注释列\n",
+ " matched_annotations = []\n",
+ " for _, row in test_df.iterrows():\n",
+ " # 组合三个字段\n",
+ " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
+ " \n",
+ " # 在规范中查找最佳匹配\n",
+ " best_match = get_best_match(combined_field, regular_dict.keys())\n",
+ " matched_annotations.append(best_match if best_match else \"未匹配\")\n",
+ "\n",
+ " # 获取ValueItemKind列的位置\n",
+ " kind_idx = test_df.columns.get_loc('ValueItemKind')\n",
+ "\n",
+ " # 在ValueItemKind列前插入新的注释列\n",
+ " test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n",
+ "\n",
+ " # 保存结果\n",
+ " test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations.csv', \n",
+ " index=False, encoding=test_encoding)\n",
+ "\n",
+ " # 打印匹配结果统计\n",
+ " print(\"\\n匹配结果统计:\")\n",
+ " print(pd.Series(matched_annotations).value_counts())\n",
+ "else:\n",
+ " print(\"无法读取文件,请手动检查文件编码\")"
+ ]
+ },
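+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For reference, `chardet.detect` also reports how sure it is: it returns a dict with `encoding`, `confidence` and `language`, not just the encoding name. A small sketch (the printed dict is illustrative):\n",
+ "\n",
+ "```python\n",
+ "import chardet\n",
+ "\n",
+ "with open(test_file, 'rb') as f: # test_file as defined in the cell above\n",
+ " print(chardet.detect(f.read()))\n",
+ "# e.g. {'encoding': 'GB2312', 'confidence': 0.99, 'language': 'Chinese'}\n",
+ "```\n",
+ "\n",
+ "GB2312 is a subset of GBK, which is in turn a subset of gb18030, so keeping `gb18030` in the fallback list covers files that chardet labels GB2312 but that contain characters outside it."
+ ]
+ },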
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "测试文件编码: GB2312\n",
+ "规范文件编码: utf-8\n",
+ "文件成功读取!\n",
+ "加载Sentence Transformer模型...\n",
+ "计算规范注释的嵌入向量...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Batches: 100%|██████████| 36/36 [00:01<00:00, 24.81it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "开始匹配注释...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Batches: 100%|██████████| 4/4 [00:00<00:00, 113.98it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "匹配结果统计:\n",
+ "未匹配 14\n",
+ "手术类型 10\n",
+ "皮肤黏膜-皮下出血 9\n",
+ "辅助检查-乙肝病毒-医院名称 9\n",
+ "患者基本情况 5\n",
+ "主任医师签名日期 5\n",
+ "手术医师签名 5\n",
+ "下一步治疗方案-具体方案 5\n",
+ "发起科室参与人员-主任医师 5\n",
+ "一般情况-主要症状及体征-姓名 5\n",
+ "一般情况-发育 3\n",
+ "中断放疗-是否 3\n",
+ "个人史-疫源接触史-接触时间 3\n",
+ "甲状腺-左侧甲状腺包块-有无 3\n",
+ "第一助手 2\n",
+ "第二助手 2\n",
+ "出院时情况 2\n",
+ "以下血管结构可见肿瘤包饶-颈总动脉 2\n",
+ "现病史-精神状态 2\n",
+ "入院时情况-主诉 2\n",
+ "病理报告-检查日期 2\n",
+ "入院时情况-患者年龄 2\n",
+ "讨论经过-病理科-医师姓名 2\n",
+ "系统回顾-运动骨骼系统-关节肿痛-时长 2\n",
+ "辅助检查-心电图-检查结论 2\n",
+ "颈部-颈部气管切开-硅胶气管筒 2\n",
+ "既往史-过敏史-药物食物过敏源 1\n",
+ "一般情况-神志 1\n",
+ "现病史-大便 1\n",
+ "系统回顾-泌尿系统-排尿困难 1\n",
+ "系统回顾-神经精神系统-癫痫 1\n",
+ "脊柱四肢-关节活动 1\n",
+ "咽-喉咽-喉咽后壁新生物-形态 1\n",
+ "讨论经过-病理科-病理结果 1\n",
+ "环后区-其他描述 1\n",
+ "系统回顾-泌尿系统-排尿困难-服用药物 1\n",
+ "系统回顾-血液系统-鼻衄史-目前清理 1\n",
+ "记录医师签名日期 1\n",
+ "一般情况-主要症状及体征-主诉 1\n",
+ "喉部增强CT-CT号 1\n",
+ "术前常规化验-化验日期 1\n",
+ "主治医师签名 1\n",
+ "主任医师签名 1\n",
+ "月经史-月经周期 1\n",
+ "术前常规化验-化验单位 1\n",
+ "Name: count, dtype: int64\n",
+ "\n",
+ "前5行匹配结果示例:\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
+ "匹配注释: 讨论经过-病理科-医师姓名\n",
+ "相似度分数: 0.8398\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
+ "匹配注释: 辅助检查-乙肝病毒-医院名称\n",
+ "相似度分数: 0.8360\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
+ "匹配注释: 入院时情况-患者年龄\n",
+ "相似度分数: 0.8390\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-主诉-主诉\n",
+ "匹配注释: 入院时情况-主诉\n",
+ "相似度分数: 0.9434\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-入院日期-入院日期\n",
+ "匹配注释: 病理报告-检查日期\n",
+ "相似度分数: 0.9574\n",
+ "--------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from sentence_transformers import SentenceTransformer\n",
+ "import chardet\n",
+ "\n",
+ "# 首先检测文件的实际编码\n",
+ "def detect_encoding(file_path):\n",
+ " with open(file_path, 'rb') as f:\n",
+ " result = chardet.detect(f.read())\n",
+ " return result['encoding']\n",
+ "\n",
+ "# 检测文件编码\n",
+ "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n",
+ "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n",
+ "\n",
+ "test_encoding = detect_encoding(test_file)\n",
+ "regular_encoding = detect_encoding(regular_file)\n",
+ "\n",
+ "print(f\"测试文件编码: {test_encoding}\")\n",
+ "print(f\"规范文件编码: {regular_encoding}\")\n",
+ "\n",
+ "# 尝试使用检测到的编码读取文件\n",
+ "try:\n",
+ " # 读取规范文件\n",
+ " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
+ " \n",
+ " # 读取测试数据\n",
+ " test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
+ " \n",
+ " print(\"文件成功读取!\")\n",
+ "except Exception as e:\n",
+ " print(f\"使用检测到的编码读取失败: {e}\")\n",
+ " \n",
+ " # 尝试其他常见编码\n",
+ " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
+ " \n",
+ " for enc in encodings:\n",
+ " try:\n",
+ " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
+ " test_df = pd.read_csv(test_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取测试文件\")\n",
+ " \n",
+ " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
+ " regular_df = pd.read_csv(regular_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取规范文件\")\n",
+ " \n",
+ " test_encoding = enc\n",
+ " regular_encoding = enc\n",
+ " break\n",
+ " except Exception as e:\n",
+ " print(f\"使用 {enc} 读取失败: {e}\")\n",
+ "\n",
+ "# 如果成功读取文件,继续处理\n",
+ "if 'test_df' in locals() and 'regular_df' in locals():\n",
+ " # 加载预训练的中文Sentence Transformer模型\n",
+ " print(\"加载Sentence Transformer模型...\")\n",
+ " model = SentenceTransformer('/home/limeng/SICT/lung_test/all-MiniLM-L6-v2') # 多语言模型,支持中文\n",
+ " \n",
+ " # 创建规范字典,键为注释,值为对应的规则\n",
+ " regular_annotations = regular_df['注释'].tolist()\n",
+ " \n",
+ " # 计算规范注释的嵌入向量\n",
+ " print(\"计算规范注释的嵌入向量...\")\n",
+ " regular_embeddings = model.encode(regular_annotations, show_progress_bar=True)\n",
+ " \n",
+ " # 创建新的注释列\n",
+ " matched_annotations = []\n",
+ " matched_scores = []\n",
+ " \n",
+ " print(\"开始匹配注释...\")\n",
+ " # 批量处理测试数据中的字段组合\n",
+ " combined_fields = []\n",
+ " for _, row in test_df.iterrows():\n",
+ " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
+ " combined_fields.append(combined_field)\n",
+ " \n",
+ " # 计算测试数据的嵌入向量\n",
+ " test_embeddings = model.encode(combined_fields, show_progress_bar=True)\n",
+ " \n",
+ " # 计算相似度并找到最佳匹配\n",
+ " for i, test_embedding in enumerate(test_embeddings):\n",
+ " # 计算与所有规范注释的余弦相似度\n",
+ " similarities = np.dot(regular_embeddings, test_embedding) / (\n",
+ " np.linalg.norm(regular_embeddings, axis=1) * np.linalg.norm(test_embedding)\n",
+ " )\n",
+ " \n",
+ " # 找到最佳匹配\n",
+ " best_match_idx = np.argmax(similarities)\n",
+ " best_match_score = similarities[best_match_idx]\n",
+ " \n",
+ " # 如果相似度低于阈值,标记为未匹配\n",
+ " if best_match_score < 0.5: # 可以调整这个阈值\n",
+ " matched_annotations.append(\"未匹配\")\n",
+ " matched_scores.append(0.0)\n",
+ " else:\n",
+ " matched_annotations.append(regular_annotations[best_match_idx])\n",
+ " matched_scores.append(best_match_score)\n",
+ " \n",
+ " # 获取ValueItemKind列的位置\n",
+ " kind_idx = test_df.columns.get_loc('ValueItemKind')\n",
+ " \n",
+ " # 在ValueItemKind列前插入新的注释列和相似度分数列\n",
+ " test_df.insert(kind_idx, 'Matched_Score', matched_scores)\n",
+ " test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n",
+ " \n",
+ " # 保存结果\n",
+ " test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations_transformer.csv', \n",
+ " index=False, encoding=test_encoding)\n",
+ " \n",
+ " # 打印匹配结果统计\n",
+ " print(\"\\n匹配结果统计:\")\n",
+ " print(pd.Series(matched_annotations).value_counts())\n",
+ " \n",
+ " # 打印前5行匹配结果示例\n",
+ " print(\"\\n前5行匹配结果示例:\")\n",
+ " for i in range(min(5, len(test_df))):\n",
+ " print(f\"原始字段: {combined_fields[i]}\")\n",
+ " print(f\"匹配注释: {test_df.iloc[i]['Matched_Annotation']}\")\n",
+ " print(f\"相似度分数: {test_df.iloc[i]['Matched_Score']:.4f}\")\n",
+ " print(\"-\" * 50)\n",
+ "else:\n",
+ " print(\"无法读取文件,请手动检查文件编码\")"
+ ]
+ },
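+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The score used above is plain cosine similarity, written out as a dot product over the vector norms (`sentence_transformers.util.cos_sim` computes the same quantity). A toy check of the formula with made-up 2-d vectors — real `all-MiniLM-L6-v2` embeddings are 384-dimensional:\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "\n",
+ "regular_embeddings = np.array([[1.0, 0.0], [0.6, 0.8]]) # stand-ins for the spec embeddings\n",
+ "test_embedding = np.array([0.8, 0.6]) # stand-in for one test-field embedding\n",
+ "\n",
+ "similarities = regular_embeddings @ test_embedding / (\n",
+ " np.linalg.norm(regular_embeddings, axis=1) * np.linalg.norm(test_embedding)\n",
+ ")\n",
+ "print(similarities) # [0.8 0.96]\n",
+ "print(np.argmax(similarities)) # 1\n",
+ "```"
+ ]
+ },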
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Building prefix dict from the default dictionary ...\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "测试文件编码: GB2312\n",
+ "规范文件编码: utf-8\n",
+ "文件成功读取!\n",
+ "开始匹配注释...\n",
+ "处理第 0/127 条记录...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Dumping model to file cache /tmp/jieba.cache\n",
+ "Loading model cost 0.792 seconds.\n",
+ "Prefix dict has been built successfully.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "处理第 100/127 条记录...\n",
+ "\n",
+ "匹配结果统计:\n",
+ "体格检查 20\n",
+ "未匹配 20\n",
+ "手术经过 13\n",
+ "姓名 11\n",
+ "主刀医师 10\n",
+ "现病史 7\n",
+ "性别 5\n",
+ "手术名称 5\n",
+ "小结时间 5\n",
+ "麻醉方式 5\n",
+ "Name: count, dtype: int64\n",
+ "\n",
+ "匹配方法使用统计:\n",
+ "Partial 100\n",
+ "None 20\n",
+ "Levenshtein 5\n",
+ "TF-IDF 2\n",
+ "Name: count, dtype: int64\n",
+ "\n",
+ "前5行匹配结果示例:\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
+ "匹配注释: 姓名\n",
+ "相似度分数: 1.0000\n",
+ "匹配方法: Partial\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
+ "匹配注释: 姓名\n",
+ "相似度分数: 1.0000\n",
+ "匹配方法: Partial\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
+ "匹配注释: 姓名\n",
+ "相似度分数: 1.0000\n",
+ "匹配方法: Partial\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-主诉-主诉\n",
+ "匹配注释: 主诉\n",
+ "相似度分数: 1.0000\n",
+ "匹配方法: Partial\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-入院日期-入院日期\n",
+ "匹配注释: 病理报告-检查日期\n",
+ "相似度分数: 0.6774\n",
+ "匹配方法: Levenshtein\n",
+ "--------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import chardet\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "import jieba\n",
+ "import Levenshtein\n",
+ "from fuzzywuzzy import fuzz\n",
+ "\n",
+ "# 首先检测文件的实际编码\n",
+ "def detect_encoding(file_path):\n",
+ " with open(file_path, 'rb') as f:\n",
+ " result = chardet.detect(f.read())\n",
+ " return result['encoding']\n",
+ "\n",
+ "# 检测文件编码\n",
+ "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n",
+ "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n",
+ "\n",
+ "test_encoding = detect_encoding(test_file)\n",
+ "regular_encoding = detect_encoding(regular_file)\n",
+ "\n",
+ "print(f\"测试文件编码: {test_encoding}\")\n",
+ "print(f\"规范文件编码: {regular_encoding}\")\n",
+ "\n",
+ "# 尝试使用检测到的编码读取文件\n",
+ "try:\n",
+ " # 读取规范文件\n",
+ " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
+ " \n",
+ " # 读取测试数据\n",
+ " test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
+ " \n",
+ " print(\"文件成功读取!\")\n",
+ "except Exception as e:\n",
+ " print(f\"使用检测到的编码读取失败: {e}\")\n",
+ " \n",
+ " # 尝试其他常见编码\n",
+ " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
+ " \n",
+ " for enc in encodings:\n",
+ " try:\n",
+ " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
+ " test_df = pd.read_csv(test_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取测试文件\")\n",
+ " \n",
+ " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
+ " regular_df = pd.read_csv(regular_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取规范文件\")\n",
+ " \n",
+ " test_encoding = enc\n",
+ " regular_encoding = enc\n",
+ " break\n",
+ " except Exception as e:\n",
+ " print(f\"使用 {enc} 读取失败: {e}\")\n",
+ "\n",
+ "# 如果成功读取文件,继续处理\n",
+ "if 'test_df' in locals() and 'regular_df' in locals():\n",
+ " # 创建规范字典,键为注释,值为对应的规则\n",
+ " regular_annotations = regular_df['注释'].tolist()\n",
+ " \n",
+ " # 准备测试数据中的字段组合\n",
+ " combined_fields = []\n",
+ " for _, row in test_df.iterrows():\n",
+ " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
+ " combined_fields.append(combined_field)\n",
+ " \n",
+ " # 定义多种相似度计算方法\n",
+ " def calculate_similarities(query, candidates):\n",
+ " \"\"\"计算多种相似度指标\"\"\"\n",
+ " results = []\n",
+ " \n",
+ " # 1. TF-IDF + 余弦相似度\n",
+ " try:\n",
+ " # 对中文文本进行分词\n",
+ " segmented_query = ' '.join(jieba.cut(query))\n",
+ " segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n",
+ " \n",
+ " # 计算TF-IDF向量\n",
+ " vectorizer = TfidfVectorizer()\n",
+ " tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n",
+ " \n",
+ " # 计算余弦相似度\n",
+ " cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n",
+ " \n",
+ " # 找到最佳匹配\n",
+ " best_idx_tfidf = np.argmax(cosine_sim)\n",
+ " best_score_tfidf = cosine_sim[best_idx_tfidf]\n",
+ " results.append((candidates[best_idx_tfidf], best_score_tfidf, \"TF-IDF\"))\n",
+ " except Exception as e:\n",
+ " print(f\"TF-IDF计算失败: {e}\")\n",
+ " \n",
+ " # 2. Levenshtein距离(编辑距离)\n",
+ " try:\n",
+ " lev_distances = [Levenshtein.distance(query, c) for c in candidates]\n",
+ " # 将距离转换为相似度分数(越小越相似)\n",
+ " max_len = max(len(query), max(len(c) for c in candidates))\n",
+ " lev_similarities = [1 - dist/max_len for dist in lev_distances]\n",
+ " \n",
+ " best_idx_lev = np.argmax(lev_similarities)\n",
+ " best_score_lev = lev_similarities[best_idx_lev]\n",
+ " results.append((candidates[best_idx_lev], best_score_lev, \"Levenshtein\"))\n",
+ " except Exception as e:\n",
+ " print(f\"Levenshtein计算失败: {e}\")\n",
+ " \n",
+ " # 3. FuzzyWuzzy比率\n",
+ " try:\n",
+ " fuzzy_ratios = [fuzz.ratio(query, c)/100 for c in candidates]\n",
+ " best_idx_fuzzy = np.argmax(fuzzy_ratios)\n",
+ " best_score_fuzzy = fuzzy_ratios[best_idx_fuzzy]\n",
+ " results.append((candidates[best_idx_fuzzy], best_score_fuzzy, \"FuzzyWuzzy\"))\n",
+ " except Exception as e:\n",
+ " print(f\"FuzzyWuzzy计算失败: {e}\")\n",
+ " \n",
+ " # 4. FuzzyWuzzy部分比率(处理子字符串)\n",
+ " try:\n",
+ " partial_ratios = [fuzz.partial_ratio(query, c)/100 for c in candidates]\n",
+ " best_idx_partial = np.argmax(partial_ratios)\n",
+ " best_score_partial = partial_ratios[best_idx_partial]\n",
+ " results.append((candidates[best_idx_partial], best_score_partial, \"Partial\"))\n",
+ " except Exception as e:\n",
+ " print(f\"Partial比率计算失败: {e}\")\n",
+ " \n",
+ " # 5. FuzzyWuzzy令牌排序比率(处理词序不同)\n",
+ " try:\n",
+ " token_sort_ratios = [fuzz.token_sort_ratio(query, c)/100 for c in candidates]\n",
+ " best_idx_token = np.argmax(token_sort_ratios)\n",
+ " best_score_token = token_sort_ratios[best_idx_token]\n",
+ " results.append((candidates[best_idx_token], best_score_token, \"TokenSort\"))\n",
+ " except Exception as e:\n",
+ " print(f\"TokenSort比率计算失败: {e}\")\n",
+ " \n",
+ " # 找出所有方法中得分最高的结果\n",
+ " best_result = max(results, key=lambda x: x[1]) if results else (None, 0, None)\n",
+ " \n",
+ " return best_result\n",
+ " \n",
+ " # 对每个测试字段进行匹配\n",
+ " matched_annotations = []\n",
+ " matched_scores = []\n",
+ " matched_methods = []\n",
+ " \n",
+ " print(\"开始匹配注释...\")\n",
+ " for i, query in enumerate(combined_fields):\n",
+ " if i % 100 == 0:\n",
+ " print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n",
+ " \n",
+ " # 计算多种相似度并选择最佳匹配\n",
+ " best_match, best_score, best_method = calculate_similarities(query, regular_annotations)\n",
+ " \n",
+ " # 如果相似度低于阈值,标记为未匹配\n",
+ " if best_score < 0.6: # 可以调整这个阈值\n",
+ " matched_annotations.append(\"未匹配\")\n",
+ " matched_scores.append(0.0)\n",
+ " matched_methods.append(\"None\")\n",
+ " else:\n",
+ " matched_annotations.append(best_match)\n",
+ " matched_scores.append(best_score)\n",
+ " matched_methods.append(best_method)\n",
+ " \n",
+ " # 获取ValueItemKind列的位置\n",
+ " kind_idx = test_df.columns.get_loc('ValueItemKind')\n",
+ " \n",
+ " # 在ValueItemKind列前插入新的列\n",
+ " test_df.insert(kind_idx, 'Matched_Method', matched_methods)\n",
+ " test_df.insert(kind_idx, 'Matched_Score', matched_scores)\n",
+ " test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n",
+ " \n",
+ " # 保存结果\n",
+ " test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations_multi.csv', \n",
+ " index=False, encoding=test_encoding)\n",
+ " \n",
+ " # 打印匹配结果统计\n",
+ " print(\"\\n匹配结果统计:\")\n",
+ " print(pd.Series(matched_annotations).value_counts().head(10))\n",
+ " \n",
+ " # 打印方法使用统计\n",
+ " print(\"\\n匹配方法使用统计:\")\n",
+ " print(pd.Series(matched_methods).value_counts())\n",
+ " \n",
+ " # 打印前5行匹配结果示例\n",
+ " print(\"\\n前5行匹配结果示例:\")\n",
+ " for i in range(min(5, len(test_df))):\n",
+ " print(f\"原始字段: {combined_fields[i]}\")\n",
+ " print(f\"匹配注释: {test_df.iloc[i]['Matched_Annotation']}\")\n",
+ " print(f\"相似度分数: {test_df.iloc[i]['Matched_Score']:.4f}\")\n",
+ " print(f\"匹配方法: {test_df.iloc[i]['Matched_Method']}\")\n",
+ " print(\"-\" * 50)\n",
+ "else:\n",
+ " print(\"无法读取文件,请手动检查文件编码\")"
+ ]
+ },
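+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Why `Partial` won 100 of the 127 rows above: `fuzz.partial_ratio` scores the best same-length window of the longer string against the shorter one, so any spec annotation that occurs verbatim inside the query scores a full 100 — which is how the 性别 and 年龄 rows ended up matched to 姓名 (a substring of 患者姓名 in the query). A small check:\n",
+ "\n",
+ "```python\n",
+ "from fuzzywuzzy import fuzz\n",
+ "\n",
+ "query = '病例特点-患者姓名、性别、年龄-性别'\n",
+ "print(fuzz.partial_ratio(query, '姓名')) # 100: '姓名' is a verbatim substring\n",
+ "print(fuzz.ratio(query, '姓名')) # 20: the full-string ratio is far lower\n",
+ "```"
+ ]
+ },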
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "测试文件编码: GB2312\n",
+ "规范文件编码: utf-8\n",
+ "文件成功读取!\n",
+ "开始匹配注释...\n",
+ "处理第 0/127 条记录...\n",
+ "处理第 100/127 条记录...\n",
+ "\n",
+ "各方法匹配结果统计:\n",
+ "TF-IDF匹配结果:\n",
+ "主刀医师 15\n",
+ "卡号 14\n",
+ "手术经过 13\n",
+ "既往史-手术外伤史-手术史-手术时间 10\n",
+ "患者姓名 7\n",
+ "患者基本情况 6\n",
+ "手术名称 5\n",
+ "麻醉方式 5\n",
+ "参与人员 4\n",
+ "主诉 3\n",
+ "Name: count, dtype: int64\n",
+ "\n",
+ "Levenshtein匹配结果:\n",
+ "既往史-手术外伤史-手术史-有无 13\n",
+ "体格检查-血压-收缩压 10\n",
+ "既往史-手术外伤史-手术史-手术时间 10\n",
+ "入院时情况-患者姓名 8\n",
+ "主要辅助检查-实验室检查-Na 8\n",
+ "主治医师签名 6\n",
+ "既往史-手术外伤史-手术史-手术名称 5\n",
+ "患者性别 5\n",
+ "主治医师签名时间 5\n",
+ "个人史-饮酒史-主要饮酒种类 5\n",
+ "Name: count, dtype: int64\n",
+ "\n",
+ "FuzzyWuzzy匹配结果:\n",
+ "讨论经过-耳鼻喉科/眼科-具体手术方案 13\n",
+ "患者姓名 11\n",
+ "既往史-手术外伤史-手术史-手术时间 10\n",
+ "体格检查-血压-收缩压 9\n",
+ "主要辅助检查-实验室检查-Na 8\n",
+ "主治医师签名 6\n",
+ "发起科室参与人员-主任医师 5\n",
+ "麻醉方式 5\n",
+ "患者性别 5\n",
+ "既往史-手术外伤史-手术史-手术名称 5\n",
+ "Name: count, dtype: int64\n",
+ "\n",
+ "前5行匹配结果示例:\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
+ "TF-IDF匹配: 患者姓名\n",
+ "Levenshtein匹配: 患者姓名\n",
+ "FuzzyWuzzy匹配: 患者姓名\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
+ "TF-IDF匹配: 患者性别\n",
+ "Levenshtein匹配: 患者姓名\n",
+ "FuzzyWuzzy匹配: 患者姓名\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
+ "TF-IDF匹配: 患者年龄\n",
+ "Levenshtein匹配: 患者姓名\n",
+ "FuzzyWuzzy匹配: 患者姓名\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-主诉-主诉\n",
+ "TF-IDF匹配: 主诉\n",
+ "Levenshtein匹配: 入院时情况-主诉\n",
+ "FuzzyWuzzy匹配: 一般情况-主要症状及体征-主诉\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-入院日期-入院日期\n",
+ "TF-IDF匹配: 入院时情况-入院时间\n",
+ "Levenshtein匹配: 病理报告-检查日期\n",
+ "FuzzyWuzzy匹配: 现病史-外院手术日期\n",
+ "--------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import chardet\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "import jieba\n",
+ "import Levenshtein\n",
+ "from fuzzywuzzy import fuzz\n",
+ "\n",
+ "# 首先检测文件的实际编码\n",
+ "def detect_encoding(file_path):\n",
+ " with open(file_path, 'rb') as f:\n",
+ " result = chardet.detect(f.read())\n",
+ " return result['encoding']\n",
+ "\n",
+ "# 检测文件编码\n",
+ "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n",
+ "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n",
+ "\n",
+ "test_encoding = detect_encoding(test_file)\n",
+ "regular_encoding = detect_encoding(regular_file)\n",
+ "\n",
+ "print(f\"测试文件编码: {test_encoding}\")\n",
+ "print(f\"规范文件编码: {regular_encoding}\")\n",
+ "\n",
+ "# 尝试使用检测到的编码读取文件\n",
+ "try:\n",
+ " # 读取规范文件\n",
+ " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
+ " \n",
+ " # 读取测试数据\n",
+ " test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
+ " \n",
+ " print(\"文件成功读取!\")\n",
+ "except Exception as e:\n",
+ " print(f\"使用检测到的编码读取失败: {e}\")\n",
+ " \n",
+ " # 尝试其他常见编码\n",
+ " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
+ " \n",
+ " for enc in encodings:\n",
+ " try:\n",
+ " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
+ " test_df = pd.read_csv(test_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取测试文件\")\n",
+ " \n",
+ " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
+ " regular_df = pd.read_csv(regular_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取规范文件\")\n",
+ " \n",
+ " test_encoding = enc\n",
+ " regular_encoding = enc\n",
+ " break\n",
+ " except Exception as e:\n",
+ " print(f\"使用 {enc} 读取失败: {e}\")\n",
+ "\n",
+ "# 如果成功读取文件,继续处理\n",
+ "if 'test_df' in locals() and 'regular_df' in locals():\n",
+ " # 创建规范字典,键为注释,值为对应的规则\n",
+ " regular_annotations = regular_df['注释'].tolist()\n",
+ " \n",
+ " # 准备测试数据中的字段组合\n",
+ " combined_fields = []\n",
+ " for _, row in test_df.iterrows():\n",
+ " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
+ " combined_fields.append(combined_field)\n",
+ " \n",
+ " # 定义多种相似度计算方法\n",
+ " def calculate_similarities(query, candidates):\n",
+ " \"\"\"计算多种相似度指标,返回每种方法的最佳匹配\"\"\"\n",
+ " results = {}\n",
+ " \n",
+ " # 1. TF-IDF + 余弦相似度\n",
+ " try:\n",
+ " # 对中文文本进行分词\n",
+ " segmented_query = ' '.join(jieba.cut(query))\n",
+ " segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n",
+ " \n",
+ " # 计算TF-IDF向量\n",
+ " vectorizer = TfidfVectorizer()\n",
+ " tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n",
+ " \n",
+ " # 计算余弦相似度\n",
+ " cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n",
+ " \n",
+ " # 找到最佳匹配\n",
+ " best_idx_tfidf = np.argmax(cosine_sim)\n",
+ " results['TF-IDF'] = candidates[best_idx_tfidf]\n",
+ " except Exception as e:\n",
+ " print(f\"TF-IDF计算失败: {e}\")\n",
+ " results['TF-IDF'] = \"未匹配\"\n",
+ " \n",
+ " # 2. Levenshtein距离(编辑距离)\n",
+ " try:\n",
+ " lev_distances = [Levenshtein.distance(query, c) for c in candidates]\n",
+ " # 将距离转换为相似度分数(越小越相似)\n",
+ " best_idx_lev = np.argmin(lev_distances)\n",
+ " results['Levenshtein'] = candidates[best_idx_lev]\n",
+ " except Exception as e:\n",
+ " print(f\"Levenshtein计算失败: {e}\")\n",
+ " results['Levenshtein'] = \"未匹配\"\n",
+ " \n",
+ " # 3. FuzzyWuzzy比率\n",
+ " try:\n",
+ " fuzzy_ratios = [fuzz.ratio(query, c) for c in candidates]\n",
+ " best_idx_fuzzy = np.argmax(fuzzy_ratios)\n",
+ " results['FuzzyWuzzy'] = candidates[best_idx_fuzzy]\n",
+ " except Exception as e:\n",
+ " print(f\"FuzzyWuzzy计算失败: {e}\")\n",
+ " results['FuzzyWuzzy'] = \"未匹配\"\n",
+ " \n",
+ " # 4. FuzzyWuzzy部分比率(处理子字符串)\n",
+ " try:\n",
+ " partial_ratios = [fuzz.partial_ratio(query, c) for c in candidates]\n",
+ " best_idx_partial = np.argmax(partial_ratios)\n",
+ " results['Partial'] = candidates[best_idx_partial]\n",
+ " except Exception as e:\n",
+ " print(f\"Partial比率计算失败: {e}\")\n",
+ " results['Partial'] = \"未匹配\"\n",
+ " \n",
+ " # 5. FuzzyWuzzy令牌排序比率(处理词序不同)\n",
+ " try:\n",
+ " token_sort_ratios = [fuzz.token_sort_ratio(query, c) for c in candidates]\n",
+ " best_idx_token = np.argmax(token_sort_ratios)\n",
+ " results['TokenSort'] = candidates[best_idx_token]\n",
+ " except Exception as e:\n",
+ " print(f\"TokenSort比率计算失败: {e}\")\n",
+ " results['TokenSort'] = \"未匹配\"\n",
+ " \n",
+ " return results\n",
+ " \n",
+ " # 对每个测试字段进行匹配\n",
+ " tfidf_matches = []\n",
+ " levenshtein_matches = []\n",
+ " fuzzywuzzy_matches = []\n",
+ " \n",
+ " print(\"开始匹配注释...\")\n",
+ " for i, query in enumerate(combined_fields):\n",
+ " if i % 100 == 0:\n",
+ " print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n",
+ " \n",
+ " # 计算多种相似度\n",
+ " matches = calculate_similarities(query, regular_annotations)\n",
+ " \n",
+ " # 保存各种方法的匹配结果\n",
+ " tfidf_matches.append(matches.get('TF-IDF', \"未匹配\"))\n",
+ " levenshtein_matches.append(matches.get('Levenshtein', \"未匹配\"))\n",
+ " fuzzywuzzy_matches.append(matches.get('FuzzyWuzzy', \"未匹配\"))\n",
+ " \n",
+ " # 获取ValueItemKind列的位置\n",
+ " kind_idx = test_df.columns.get_loc('ValueItemKind')\n",
+ " \n",
+ " # 在ValueItemKind列前插入新的列\n",
+ " test_df.insert(kind_idx, 'FuzzyWuzzy_Match', fuzzywuzzy_matches)\n",
+ " test_df.insert(kind_idx, 'Levenshtein_Match', levenshtein_matches)\n",
+ " test_df.insert(kind_idx, 'TFIDF_Match', tfidf_matches)\n",
+ " \n",
+ " # 保存结果\n",
+ " test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations_all.csv', \n",
+ " index=False, encoding=test_encoding)\n",
+ " \n",
+ " # 打印匹配结果统计\n",
+ " print(\"\\n各方法匹配结果统计:\")\n",
+ " print(\"TF-IDF匹配结果:\")\n",
+ " print(pd.Series(tfidf_matches).value_counts().head(10))\n",
+ " print(\"\\nLevenshtein匹配结果:\")\n",
+ " print(pd.Series(levenshtein_matches).value_counts().head(10))\n",
+ " print(\"\\nFuzzyWuzzy匹配结果:\")\n",
+ " print(pd.Series(fuzzywuzzy_matches).value_counts().head(10))\n",
+ " \n",
+ " # 打印前5行匹配结果示例\n",
+ " print(\"\\n前5行匹配结果示例:\")\n",
+ " for i in range(min(5, len(test_df))):\n",
+ " print(f\"原始字段: {combined_fields[i]}\")\n",
+ " print(f\"TF-IDF匹配: {test_df.iloc[i]['TFIDF_Match']}\")\n",
+ " print(f\"Levenshtein匹配: {test_df.iloc[i]['Levenshtein_Match']}\")\n",
+ " print(f\"FuzzyWuzzy匹配: {test_df.iloc[i]['FuzzyWuzzy_Match']}\")\n",
+ " print(\"-\" * 50)\n",
+ "else:\n",
+ " print(\"无法读取文件,请手动检查文件编码\")"
+ ]
+ },
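+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note the two multi-metric cells above handle Levenshtein differently: the first divides by a single global `max_len`, which leaves the ranking identical to raw minimum distance (and so biased toward short candidates), while this one takes `argmin` over the raw distances directly. The usual per-pair normalization looks like this — a minimal sketch with illustrative strings:\n",
+ "\n",
+ "```python\n",
+ "import Levenshtein\n",
+ "\n",
+ "d = Levenshtein.distance('患者姓名', '姓名') # 2 deletions\n",
+ "sim = 1 - d / max(len('患者姓名'), len('姓名')) # per-pair normalized similarity\n",
+ "print(d, sim) # 2 0.5\n",
+ "```"
+ ]
+ },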
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "FileNotFoundError",
+ "evalue": "[Errno 2] No such file or directory: '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[1], line 20\u001b[0m\n\u001b[1;32m 17\u001b[0m test_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 18\u001b[0m regular_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/home/limeng/SICT/lung_test/data/regular.csv\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 20\u001b[0m test_encoding \u001b[38;5;241m=\u001b[39m \u001b[43mdetect_encoding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtest_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 21\u001b[0m regular_encoding \u001b[38;5;241m=\u001b[39m detect_encoding(regular_file)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m测试文件编码: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtest_encoding\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
+ "Cell \u001b[0;32mIn[1], line 12\u001b[0m, in \u001b[0;36mdetect_encoding\u001b[0;34m(file_path)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdetect_encoding\u001b[39m(file_path):\n\u001b[0;32m---> 12\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 13\u001b[0m result \u001b[38;5;241m=\u001b[39m chardet\u001b[38;5;241m.\u001b[39mdetect(f\u001b[38;5;241m.\u001b[39mread())\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mencoding\u001b[39m\u001b[38;5;124m'\u001b[39m]\n",
+ "File \u001b[0;32m~/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/IPython/core/interactiveshell.py:324\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 319\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 320\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 321\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 322\u001b[0m )\n\u001b[0;32m--> 324\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+ "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import chardet\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "import jieba\n",
+ "import Levenshtein\n",
+ "from fuzzywuzzy import fuzz\n",
+ "\n",
+ "# 首先检测文件的实际编码\n",
+ "def detect_encoding(file_path):\n",
+ " with open(file_path, 'rb') as f:\n",
+ " result = chardet.detect(f.read())\n",
+ " return result['encoding']\n",
+ "\n",
+ "# 检测文件编码\n",
+ "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
+ "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
+ "\n",
+ "test_encoding = detect_encoding(test_file)\n",
+ "regular_encoding = detect_encoding(regular_file)\n",
+ "\n",
+ "print(f\"测试文件编码: {test_encoding}\")\n",
+ "print(f\"规范文件编码: {regular_encoding}\")\n",
+ "\n",
+ "# 尝试使用检测到的编码读取文件\n",
+ "try:\n",
+ " # 读取规范文件\n",
+ " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
+ " \n",
+ " # 读取测试数据\n",
+ " test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
+ " \n",
+ " print(\"文件成功读取!\")\n",
+ "except Exception as e:\n",
+ " print(f\"使用检测到的编码读取失败: {e}\")\n",
+ " \n",
+ " # 尝试其他常见编码\n",
+ " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
+ " \n",
+ " for enc in encodings:\n",
+ " try:\n",
+ " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
+ " test_df = pd.read_csv(test_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取测试文件\")\n",
+ " \n",
+ " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
+ " regular_df = pd.read_csv(regular_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取规范文件\")\n",
+ " \n",
+ " test_encoding = enc\n",
+ " regular_encoding = enc\n",
+ " break\n",
+ " except Exception as e:\n",
+ " print(f\"使用 {enc} 读取失败: {e}\")\n",
+ "\n",
+ "# 如果成功读取文件,继续处理\n",
+ "if 'test_df' in locals() and 'regular_df' in locals():\n",
+ " # 创建规范字典,键为注释,值为对应的规则\n",
+ " regular_annotations = regular_df['注释'].tolist()\n",
+ " \n",
+ " # 准备测试数据中的字段组合\n",
+ " combined_fields = []\n",
+ " paragraph_names = []\n",
+ " statement_names = []\n",
+ " value_item_names = []\n",
+ " \n",
+ " for _, row in test_df.iterrows():\n",
+ " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
+ " combined_fields.append(combined_field)\n",
+ " paragraph_names.append(row['ParagraphName'])\n",
+ " statement_names.append(row['StatementName'])\n",
+ " value_item_names.append(row['ValueItemName'])\n",
+ " \n",
+ " # 定义多种相似度计算方法\n",
+ " def calculate_similarities(query, candidates):\n",
+ " \"\"\"计算多种相似度指标,返回每种方法的最佳匹配和分数\"\"\"\n",
+ " results = {}\n",
+ " scores = {}\n",
+ " \n",
+ " # 1. TF-IDF + 余弦相似度\n",
+ " try:\n",
+ " # 对中文文本进行分词\n",
+ " segmented_query = ' '.join(jieba.cut(query))\n",
+ " segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n",
+ " \n",
+ " # 计算TF-IDF向量\n",
+ " vectorizer = TfidfVectorizer()\n",
+ " tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n",
+ " \n",
+ " # 计算余弦相似度\n",
+ " cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n",
+ " \n",
+ " # 找到最佳匹配\n",
+ " best_idx_tfidf = np.argmax(cosine_sim)\n",
+ " results['TF-IDF'] = candidates[best_idx_tfidf]\n",
+ " scores['TF-IDF'] = cosine_sim[best_idx_tfidf]\n",
+ " except Exception as e:\n",
+ " print(f\"TF-IDF计算失败: {e}\")\n",
+ " results['TF-IDF'] = \"未匹配\"\n",
+ " scores['TF-IDF'] = 0.0\n",
+ " \n",
+ " # 2. Levenshtein距离(编辑距离)\n",
+ " try:\n",
+ " lev_distances = [Levenshtein.distance(query, c) for c in candidates]\n",
+ " # 将距离转换为相似度分数(越小越相似)\n",
+ " max_len = max(len(query), max(len(c) for c in candidates))\n",
+ " lev_similarities = [1 - dist/max_len for dist in lev_distances]\n",
+ " \n",
+ " best_idx_lev = np.argmax(lev_similarities)\n",
+ " results['Levenshtein'] = candidates[best_idx_lev]\n",
+ " scores['Levenshtein'] = lev_similarities[best_idx_lev]\n",
+ " except Exception as e:\n",
+ " print(f\"Levenshtein计算失败: {e}\")\n",
+ " results['Levenshtein'] = \"未匹配\"\n",
+ " scores['Levenshtein'] = 0.0\n",
+ " \n",
+ " # 3. FuzzyWuzzy比率\n",
+ " try:\n",
+ " fuzzy_ratios = [fuzz.ratio(query, c)/100 for c in candidates]\n",
+ " best_idx_fuzzy = np.argmax(fuzzy_ratios)\n",
+ " results['FuzzyWuzzy'] = candidates[best_idx_fuzzy]\n",
+ " scores['FuzzyWuzzy'] = fuzzy_ratios[best_idx_fuzzy]\n",
+ " except Exception as e:\n",
+ " print(f\"FuzzyWuzzy计算失败: {e}\")\n",
+ " results['FuzzyWuzzy'] = \"未匹配\"\n",
+ " scores['FuzzyWuzzy'] = 0.0\n",
+ " \n",
+ " return results, scores\n",
+ " \n",
+ " # 对每个测试字段进行匹配\n",
+ " tfidf_matches = []\n",
+ " levenshtein_matches = []\n",
+ " fuzzywuzzy_matches = []\n",
+ " best_matches = []\n",
+ " best_match_methods = []\n",
+ " \n",
+ " print(\"开始匹配注释...\")\n",
+ " for i in range(len(combined_fields)):\n",
+ " if i % 100 == 0:\n",
+ " print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n",
+ " \n",
+ " query = combined_fields[i]\n",
+ " paragraph_name = paragraph_names[i]\n",
+ " value_item_name = value_item_names[i]\n",
+ " \n",
+ " # 1. 首先检查是否有注释包含ParagraphName\n",
+ " paragraph_matches = [ann for ann in regular_annotations if paragraph_name in ann]\n",
+ " \n",
+ " if paragraph_matches:\n",
+ " # 2. 如果有包含ParagraphName的注释,再检查是否有同时包含ValueItemName的\n",
+ " value_matches = [ann for ann in paragraph_matches if value_item_name in ann]\n",
+ " if value_matches:\n",
+ " # 找到同时包含ParagraphName和ValueItemName的注释\n",
+ " best_match = value_matches[0] # 取第一个匹配\n",
+ " best_match_method = \"精确匹配(段落+值)\"\n",
+ " else:\n",
+ " # 只找到包含ParagraphName的注释\n",
+ " best_match = paragraph_matches[0] # 取第一个匹配\n",
+ " best_match_method = \"段落匹配\"\n",
+ " else:\n",
+ " # 3. 如果没有包含ParagraphName的注释,直接使用相似度指标\n",
+ " matches, scores = calculate_similarities(query, regular_annotations)\n",
+ " \n",
+ " # 选择得分最高的方法\n",
+ " best_method = max(scores.items(), key=lambda x: x[1])[0]\n",
+ " best_match = matches[best_method]\n",
+ " best_match_method = f\"相似度({best_method})\"\n",
+ " \n",
+ " # 计算相似度匹配以便比较\n",
+ " matches, _ = calculate_similarities(query, regular_annotations)\n",
+ " tfidf_matches.append(matches.get('TF-IDF', \"未匹配\"))\n",
+ " levenshtein_matches.append(matches.get('Levenshtein', \"未匹配\"))\n",
+ " fuzzywuzzy_matches.append(matches.get('FuzzyWuzzy', \"未匹配\"))\n",
+ " \n",
+ " best_matches.append(best_match)\n",
+ " best_match_methods.append(best_match_method)\n",
+ " \n",
+ " # 获取ValueItemKind列的位置\n",
+ " kind_idx = test_df.columns.get_loc('ValueItemKind')\n",
+ " \n",
+ " # 在ValueItemKind列前插入新的列\n",
+ " test_df.insert(kind_idx, 'Best_Match_Method', best_match_methods)\n",
+ " test_df.insert(kind_idx, 'Best_Match', best_matches)\n",
+ " test_df.insert(kind_idx, 'FuzzyWuzzy_Match', fuzzywuzzy_matches)\n",
+ " test_df.insert(kind_idx, 'Levenshtein_Match', levenshtein_matches)\n",
+ " test_df.insert(kind_idx, 'TFIDF_Match', tfidf_matches)\n",
+ " \n",
+ " # 保存结果\n",
+ " test_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_with_annotations_all3.csv', \n",
+ " index=False, encoding=test_encoding)\n",
+ " \n",
+ " # 打印匹配结果统计\n",
+ " print(\"\\n最佳匹配方法统计:\")\n",
+ " print(pd.Series(best_match_methods).value_counts())\n",
+ " \n",
+ " print(\"\\n各方法匹配结果统计:\")\n",
+ " print(\"最佳匹配结果:\")\n",
+ " print(pd.Series(best_matches).value_counts().head(10))\n",
+ " print(\"\\nTF-IDF匹配结果:\")\n",
+ " print(pd.Series(tfidf_matches).value_counts().head(10))\n",
+ " print(\"\\nLevenshtein匹配结果:\")\n",
+ " print(pd.Series(levenshtein_matches).value_counts().head(10))\n",
+ " print(\"\\nFuzzyWuzzy匹配结果:\")\n",
+ " print(pd.Series(fuzzywuzzy_matches).value_counts().head(10))\n",
+ " \n",
+ " # 打印前5行匹配结果示例\n",
+ " print(\"\\n前5行匹配结果示例:\")\n",
+ " for i in range(min(5, len(test_df))):\n",
+ " print(f\"原始字段: {combined_fields[i]}\")\n",
+ " print(f\"最佳匹配: {test_df.iloc[i]['Best_Match']} (方法: {test_df.iloc[i]['Best_Match_Method']})\")\n",
+ " print(f\"TF-IDF匹配: {test_df.iloc[i]['TFIDF_Match']}\")\n",
+ " print(f\"Levenshtein匹配: {test_df.iloc[i]['Levenshtein_Match']}\")\n",
+ " print(f\"FuzzyWuzzy匹配: {test_df.iloc[i]['FuzzyWuzzy_Match']}\")\n",
+ " print(\"-\" * 50)\n",
+ "else:\n",
+ " print(\"无法读取文件,请手动检查文件编码\")"
+ ]
+ },
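+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The cell above layers exact substring containment ahead of the fuzzy metrics: annotations containing the ParagraphName are filtered first, then narrowed by ValueItemName, and similarity only steps in when both filters come up empty. A toy run of that cascade (the annotations are illustrative):\n",
+ "\n",
+ "```python\n",
+ "regular_annotations = ['入院时情况-主诉', '现病史-精神状态']\n",
+ "paragraph_name, value_item_name = '现病史', '精神状态'\n",
+ "\n",
+ "paragraph_matches = [ann for ann in regular_annotations if paragraph_name in ann]\n",
+ "value_matches = [ann for ann in paragraph_matches if value_item_name in ann]\n",
+ "print(paragraph_matches) # ['现病史-精神状态']\n",
+ "print(value_matches) # ['现病史-精神状态']\n",
+ "```"
+ ]
+ },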
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Building prefix dict from the default dictionary ...\n",
+ "Loading model from cache /tmp/jieba.cache\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "测试文件编码: GB2312\n",
+ "规范文件编码: utf-8\n",
+ "文件成功读取!\n",
+ "开始匹配注释...\n",
+ "处理第 0/127 条记录...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading model cost 1.175 seconds.\n",
+ "Prefix dict has been built successfully.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "处理第 100/127 条记录...\n",
+ "\n",
+ "最佳匹配方法统计:\n",
+ "TF-IDF相似度 89\n",
+ "段落匹配 32\n",
+ "精确匹配(段落+值) 6\n",
+ "Name: count, dtype: int64\n",
+ "\n",
+ "匹配结果统计:\n",
+ "最佳匹配结果:\n",
+ "体格检查-体温 20\n",
+ "主刀医师 15\n",
+ "卡号 14\n",
+ "手术经过 13\n",
+ "既往史-手术外伤史-手术史-手术时间 10\n",
+ "患者姓名 7\n",
+ "患者基本情况 6\n",
+ "现病史-发病日期 6\n",
+ "麻醉方式 5\n",
+ "手术名称 5\n",
+ "Name: count, dtype: int64\n",
+ "\n",
+ "TF-IDF匹配结果:\n",
+ "主刀医师 15\n",
+ "卡号 14\n",
+ "手术经过 13\n",
+ "既往史-手术外伤史-手术史-手术时间 10\n",
+ "患者姓名 7\n",
+ "患者基本情况 6\n",
+ "手术名称 5\n",
+ "麻醉方式 5\n",
+ "参与人员 4\n",
+ "主诉 3\n",
+ "Name: count, dtype: int64\n",
+ "\n",
+ "前5行匹配结果示例:\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
+ "最佳匹配: 患者姓名 (方法: TF-IDF相似度)\n",
+ "相似度分数: 0.5819\n",
+ "TF-IDF匹配: 患者姓名\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
+ "最佳匹配: 患者性别 (方法: TF-IDF相似度)\n",
+ "相似度分数: 0.6637\n",
+ "TF-IDF匹配: 患者性别\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
+ "最佳匹配: 患者年龄 (方法: TF-IDF相似度)\n",
+ "相似度分数: 0.6576\n",
+ "TF-IDF匹配: 患者年龄\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-主诉-主诉\n",
+ "最佳匹配: 主诉 (方法: TF-IDF相似度)\n",
+ "相似度分数: 0.7628\n",
+ "TF-IDF匹配: 主诉\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-入院日期-入院日期\n",
+ "最佳匹配: 入院时情况-入院时间 (方法: TF-IDF相似度)\n",
+ "相似度分数: 0.5297\n",
+ "TF-IDF匹配: 入院时情况-入院时间\n",
+ "--------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import chardet\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "import jieba\n",
+ "\n",
+ "# 首先检测文件的实际编码\n",
+ "def detect_encoding(file_path):\n",
+ " with open(file_path, 'rb') as f:\n",
+ " result = chardet.detect(f.read())\n",
+ " return result['encoding']\n",
+ "\n",
+ "# 检测文件编码\n",
+ "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
+ "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
+ "\n",
+ "test_encoding = detect_encoding(test_file)\n",
+ "regular_encoding = detect_encoding(regular_file)\n",
+ "\n",
+ "print(f\"测试文件编码: {test_encoding}\")\n",
+ "print(f\"规范文件编码: {regular_encoding}\")\n",
+ "\n",
+ "# 尝试使用检测到的编码读取文件\n",
+ "try:\n",
+ " # 读取规范文件\n",
+ " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
+ " \n",
+ " # 读取测试数据\n",
+ " test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
+ " \n",
+ " print(\"文件成功读取!\")\n",
+ "except Exception as e:\n",
+ " print(f\"使用检测到的编码读取失败: {e}\")\n",
+ " \n",
+ " # 尝试其他常见编码\n",
+ " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
+ " \n",
+ " for enc in encodings:\n",
+ " try:\n",
+ " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
+ " test_df = pd.read_csv(test_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取测试文件\")\n",
+ " \n",
+ " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
+ " regular_df = pd.read_csv(regular_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取规范文件\")\n",
+ " \n",
+ " test_encoding = enc\n",
+ " regular_encoding = enc\n",
+ " break\n",
+ " except Exception as e:\n",
+ " print(f\"使用 {enc} 读取失败: {e}\")\n",
+ "\n",
+ "# 如果成功读取文件,继续处理\n",
+ "if 'test_df' in locals() and 'regular_df' in locals():\n",
+ " # 创建规范字典,键为注释,值为对应的规则\n",
+ " regular_annotations = regular_df['注释'].tolist()\n",
+ " \n",
+ " # 准备测试数据中的字段组合\n",
+ " combined_fields = []\n",
+ " paragraph_names = []\n",
+ " statement_names = []\n",
+ " value_item_names = []\n",
+ " \n",
+ " for _, row in test_df.iterrows():\n",
+ " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
+ " combined_fields.append(combined_field)\n",
+ " paragraph_names.append(row['ParagraphName'])\n",
+ " statement_names.append(row['StatementName'])\n",
+ " value_item_names.append(row['ValueItemName'])\n",
+ " \n",
+ " # 定义TF-IDF相似度计算方法\n",
+ " def calculate_tfidf_similarity(query, candidates):\n",
+ " \"\"\"计算TF-IDF相似度,返回最佳匹配和分数\"\"\"\n",
+ " try:\n",
+ " # 对中文文本进行分词\n",
+ " segmented_query = ' '.join(jieba.cut(query))\n",
+ " segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n",
+ " \n",
+ " # 计算TF-IDF向量\n",
+ " vectorizer = TfidfVectorizer()\n",
+ " tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n",
+ " \n",
+ " # 计算余弦相似度\n",
+ " cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n",
+ " \n",
+ " # 找到最佳匹配\n",
+ " best_idx = np.argmax(cosine_sim)\n",
+ " return candidates[best_idx], cosine_sim[best_idx]\n",
+ " except Exception as e:\n",
+ " print(f\"TF-IDF计算失败: {e}\")\n",
+ " return \"未匹配\", 0.0\n",
+ " \n",
+ " # 对每个测试字段进行匹配\n",
+ " tfidf_matches = []\n",
+ " best_matches = []\n",
+ " best_match_methods = []\n",
+ " similarity_scores = []\n",
+ " \n",
+ " print(\"开始匹配注释...\")\n",
+ " for i in range(len(combined_fields)):\n",
+ " if i % 100 == 0:\n",
+ " print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n",
+ " \n",
+ " query = combined_fields[i]\n",
+ " paragraph_name = paragraph_names[i]\n",
+ " value_item_name = value_item_names[i]\n",
+ " \n",
+ " # 1. 首先检查是否有注释包含ParagraphName\n",
+ " paragraph_matches = [ann for ann in regular_annotations if paragraph_name in ann]\n",
+ " \n",
+ " if paragraph_matches:\n",
+ " # 2. 如果有包含ParagraphName的注释,再检查是否有同时包含ValueItemName的\n",
+ " value_matches = [ann for ann in paragraph_matches if value_item_name in ann]\n",
+ " if value_matches:\n",
+ " # 找到同时包含ParagraphName和ValueItemName的注释\n",
+ " best_match = value_matches[0] # 取第一个匹配\n",
+ " best_match_method = \"精确匹配(段落+值)\"\n",
+ " similarity_score = 1.0 # 精确匹配给予最高分\n",
+ " else:\n",
+ " # 只找到包含ParagraphName的注释\n",
+ " best_match = paragraph_matches[0] # 取第一个匹配\n",
+ " best_match_method = \"段落匹配\"\n",
+ " similarity_score = 0.8 # 段落匹配给予较高分\n",
+ " else:\n",
+ " # 3. 如果没有包含ParagraphName的注释,使用TF-IDF相似度\n",
+ " best_match, similarity_score = calculate_tfidf_similarity(query, regular_annotations)\n",
+ " best_match_method = \"TF-IDF相似度\"\n",
+ " \n",
+ " # 计算TF-IDF匹配以便比较\n",
+ " tfidf_match, _ = calculate_tfidf_similarity(query, regular_annotations)\n",
+ " tfidf_matches.append(tfidf_match)\n",
+ " \n",
+ " best_matches.append(best_match)\n",
+ " best_match_methods.append(best_match_method)\n",
+ " similarity_scores.append(similarity_score)\n",
+ " \n",
+ " # 获取ValueItemKind列的位置\n",
+ " kind_idx = test_df.columns.get_loc('ValueItemKind')\n",
+ " \n",
+ " # 在ValueItemKind列前插入新的列\n",
+ " test_df.insert(kind_idx, 'Similarity_Score', similarity_scores)\n",
+ " test_df.insert(kind_idx, 'Best_Match_Method', best_match_methods)\n",
+ " test_df.insert(kind_idx, 'Best_Match', best_matches)\n",
+ " test_df.insert(kind_idx, 'TFIDF_Match', tfidf_matches)\n",
+ " \n",
+ " # 保存结果\n",
+ " test_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_with_tfidf.csv', \n",
+ " index=False, encoding=test_encoding)\n",
+ " \n",
+ " # 打印匹配结果统计\n",
+ " print(\"\\n最佳匹配方法统计:\")\n",
+ " print(pd.Series(best_match_methods).value_counts())\n",
+ " \n",
+ " print(\"\\n匹配结果统计:\")\n",
+ " print(\"最佳匹配结果:\")\n",
+ " print(pd.Series(best_matches).value_counts().head(10))\n",
+ " print(\"\\nTF-IDF匹配结果:\")\n",
+ " print(pd.Series(tfidf_matches).value_counts().head(10))\n",
+ " \n",
+ " # 打印前5行匹配结果示例\n",
+ " print(\"\\n前5行匹配结果示例:\")\n",
+ " for i in range(min(5, len(test_df))):\n",
+ " print(f\"原始字段: {combined_fields[i]}\")\n",
+ " print(f\"最佳匹配: {test_df.iloc[i]['Best_Match']} (方法: {test_df.iloc[i]['Best_Match_Method']})\")\n",
+ " print(f\"相似度分数: {test_df.iloc[i]['Similarity_Score']:.4f}\")\n",
+ " print(f\"TF-IDF匹配: {test_df.iloc[i]['TFIDF_Match']}\")\n",
+ " print(\"-\" * 50)\n",
+ "else:\n",
+ " print(\"无法读取文件,请手动检查文件编码\")"
+ ]
+ },
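+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The jieba pass matters because Chinese has no word boundaries: without it `TfidfVectorizer` would see each whole string as a single token and no terms would overlap. Note also that the vectorizer's default `token_pattern` (`(?u)\\b\\w\\w+\\b`) silently drops one-character tokens. A quick look at what the segmentation produces (the output shown is what jieba typically yields; exact cuts depend on its dictionary):\n",
+ "\n",
+ "```python\n",
+ "import jieba\n",
+ "\n",
+ "print(' '.join(jieba.cut('病例特点-患者姓名'))) # e.g. 病例 特点 - 患者 姓名\n",
+ "```"
+ ]
+ },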
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "测试文件编码: GB2312\n",
+ "规范文件编码: utf-8\n",
+ "文件成功读取!\n",
+ "开始匹配注释...\n",
+ "处理第 0/127 条记录...\n",
+ "处理第 100/127 条记录...\n",
+ "\n",
+ "匹配完成,共处理 127 条记录\n",
+ "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\n",
+ "\n",
+ "前3行匹配结果示例:\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
+ "匹配1: 患者姓名 (分数: 0.5819)\n",
+ "匹配2: 患者姓名 (分数: 0.5819)\n",
+ "匹配3: 姓名 (分数: 0.5574)\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
+ "匹配1: 患者性别 (分数: 0.6637)\n",
+ "匹配2: 性别 (分数: 0.6416)\n",
+ "匹配3: 性别 (分数: 0.6416)\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
+ "匹配1: 患者年龄 (分数: 0.6576)\n",
+ "匹配2: 年龄 (分数: 0.6348)\n",
+ "匹配3: 年龄 (分数: 0.6348)\n",
+ "--------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import chardet\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "import jieba\n",
+ "\n",
+ "# 首先检测文件的实际编码\n",
+ "def detect_encoding(file_path):\n",
+ " with open(file_path, 'rb') as f:\n",
+ " result = chardet.detect(f.read())\n",
+ " return result['encoding']\n",
+ "\n",
+ "# 检测文件编码\n",
+ "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
+ "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
+ "\n",
+ "test_encoding = detect_encoding(test_file)\n",
+ "regular_encoding = detect_encoding(regular_file)\n",
+ "\n",
+ "print(f\"测试文件编码: {test_encoding}\")\n",
+ "print(f\"规范文件编码: {regular_encoding}\")\n",
+ "\n",
+ "# 尝试使用检测到的编码读取文件\n",
+ "try:\n",
+ " # 读取规范文件\n",
+ " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
+ " \n",
+ " # 读取测试数据\n",
+ " test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
+ " \n",
+ " print(\"文件成功读取!\")\n",
+ "except Exception as e:\n",
+ " print(f\"使用检测到的编码读取失败: {e}\")\n",
+ " \n",
+ " # 尝试其他常见编码\n",
+ " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
+ " \n",
+ " for enc in encodings:\n",
+ " try:\n",
+ " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
+ " test_df = pd.read_csv(test_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取测试文件\")\n",
+ " \n",
+ " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
+ " regular_df = pd.read_csv(regular_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取规范文件\")\n",
+ " \n",
+ " test_encoding = enc\n",
+ " regular_encoding = enc\n",
+ " break\n",
+ " except Exception as e:\n",
+ " print(f\"使用 {enc} 读取失败: {e}\")\n",
+ "\n",
+ "# 如果成功读取文件,继续处理\n",
+ "if 'test_df' in locals() and 'regular_df' in locals():\n",
+ " # 创建规范字典,键为注释,值为对应的规则\n",
+ " regular_annotations = regular_df['注释'].tolist()\n",
+ " \n",
+ " # 准备测试数据中的字段组合\n",
+ " combined_fields = []\n",
+ " \n",
+ " for _, row in test_df.iterrows():\n",
+ " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
+ " combined_fields.append(combined_field)\n",
+ " \n",
+ " # 定义TF-IDF相似度计算方法,返回前3个最佳匹配\n",
+ " def calculate_top3_tfidf_similarity(query, candidates):\n",
+ " \"\"\"计算TF-IDF相似度,返回前3个最佳匹配和分数\"\"\"\n",
+ " try:\n",
+ " # 对中文文本进行分词\n",
+ " segmented_query = ' '.join(jieba.cut(query))\n",
+ " segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n",
+ " \n",
+ " # 计算TF-IDF向量\n",
+ " vectorizer = TfidfVectorizer()\n",
+ " tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n",
+ " \n",
+ " # 计算余弦相似度\n",
+ " cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n",
+ " \n",
+ " # 找到前3个最佳匹配\n",
+ " top3_indices = np.argsort(cosine_sim)[::-1][:3]\n",
+ " top3_scores = cosine_sim[top3_indices]\n",
+ " \n",
+ " return top3_indices, top3_scores\n",
+ " except Exception as e:\n",
+ " print(f\"TF-IDF计算失败: {e}\")\n",
+ " return [-1, -1, -1], [0.0, 0.0, 0.0]\n",
+ " \n",
+ " # 创建结果DataFrame\n",
+ " result_data = []\n",
+ " \n",
+ " print(\"开始匹配注释...\")\n",
+ " for i in range(len(test_df)):\n",
+ " if i % 100 == 0:\n",
+ " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n",
+ " \n",
+ " query = combined_fields[i]\n",
+ " \n",
+ " # 使用TF-IDF相似度匹配,获取前3个最佳匹配\n",
+ " top3_indices, top3_scores = calculate_top3_tfidf_similarity(query, regular_annotations)\n",
+ " \n",
+ " # 获取测试数据的相关字段\n",
+ " paragraph_name = test_df.iloc[i]['ParagraphName']\n",
+ " statement_name = test_df.iloc[i]['StatementName']\n",
+ " value_item_name = test_df.iloc[i]['ValueItemName']\n",
+ " display_string = test_df.iloc[i]['DisplayString']\n",
+ " \n",
+ " # 获取前3个规范数据的相关字段\n",
+ " regular_nodes = []\n",
+ " regular_annotations_matched = []\n",
+ " regular_descriptions = []\n",
+ " \n",
+ " for idx, score in zip(top3_indices, top3_scores):\n",
+ " if idx >= 0:\n",
+ " regular_nodes.append(regular_df.iloc[idx]['节点名'])\n",
+ " regular_annotations_matched.append(regular_df.iloc[idx]['注释'])\n",
+ " regular_descriptions.append(regular_df.iloc[idx]['说明'])\n",
+ " else:\n",
+ " regular_nodes.append(\"未匹配\")\n",
+ " regular_annotations_matched.append(\"未匹配\")\n",
+ " regular_descriptions.append(\"未匹配\")\n",
+ " \n",
+ " # 确保有3个结果(如果候选项少于3个)\n",
+ " while len(regular_nodes) < 3:\n",
+ " regular_nodes.append(\"未匹配\")\n",
+ " regular_annotations_matched.append(\"未匹配\")\n",
+ " regular_descriptions.append(\"未匹配\")\n",
+ " top3_scores = np.append(top3_scores, 0.0)\n",
+ " \n",
+ " # 添加到结果数据\n",
+ " result_data.append({\n",
+ " 'ParagraphName': paragraph_name,\n",
+ " 'StatementName': statement_name,\n",
+ " 'ValueItemName': value_item_name,\n",
+ " 'DisplayString': display_string,\n",
+ " '规范节点名1': regular_nodes[0],\n",
+ " '规范注释1': regular_annotations_matched[0],\n",
+ " '规范说明1': regular_descriptions[0],\n",
+ " '相似度分数1': top3_scores[0],\n",
+ " '规范节点名2': regular_nodes[1],\n",
+ " '规范注释2': regular_annotations_matched[1],\n",
+ " '规范说明2': regular_descriptions[1],\n",
+ " '相似度分数2': top3_scores[1],\n",
+ " '规范节点名3': regular_nodes[2],\n",
+ " '规范注释3': regular_annotations_matched[2],\n",
+ " '规范说明3': regular_descriptions[2],\n",
+ " '相似度分数3': top3_scores[2]\n",
+ " })\n",
+ " \n",
+ " # 创建结果DataFrame\n",
+ " result_df = pd.DataFrame(result_data)\n",
+ " \n",
+ " # 保存结果\n",
+ " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv', \n",
+ " index=False, encoding=test_encoding)\n",
+ " \n",
+ " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n",
+ " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\")\n",
+ " \n",
+ " # 打印前3行匹配结果示例\n",
+ " print(\"\\n前3行匹配结果示例:\")\n",
+ " for i in range(min(3, len(result_df))):\n",
+ " print(f\"原始字段: {combined_fields[i]}\")\n",
+ " print(f\"匹配1: {result_df.iloc[i]['规范注释1']} (分数: {result_df.iloc[i]['相似度分数1']:.4f})\")\n",
+ " print(f\"匹配2: {result_df.iloc[i]['规范注释2']} (分数: {result_df.iloc[i]['相似度分数2']:.4f})\")\n",
+ " print(f\"匹配3: {result_df.iloc[i]['规范注释3']} (分数: {result_df.iloc[i]['相似度分数3']:.4f})\")\n",
+ " print(\"-\" * 50)\n",
+ "else:\n",
+ " print(\"无法读取文件,请手动检查文件编码\")"
+ ]
+ },
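+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A note on the cell above: `calculate_top3_tfidf_similarity` refits a fresh `TfidfVectorizer` over all candidate annotations for every single query. A minimal optimization sketch, assuming `regular_annotations` from the cell above is still in scope; scores will differ slightly because the IDF is fitted on the candidates only, and `token_pattern=None` avoids the sklearn warning that appears in later cells:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import jieba\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "\n",
+ "# Fit the vocabulary and IDF once on the candidate annotations...\n",
+ "vectorizer = TfidfVectorizer(tokenizer=lambda t: list(jieba.cut(t)), token_pattern=None)\n",
+ "candidate_matrix = vectorizer.fit_transform(regular_annotations)\n",
+ "\n",
+ "def top3_tfidf(query):\n",
+ "    # ...then only transform each query against the cached candidate matrix\n",
+ "    q = vectorizer.transform([query])\n",
+ "    scores = cosine_similarity(q, candidate_matrix).ravel()\n",
+ "    top3 = np.argsort(scores)[::-1][:3]\n",
+ "    return top3, scores[top3]"
+ ]
+ },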
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "测试文件编码: GB2312\n",
+ "规范文件编码: utf-8\n",
+ "文件成功读取!\n",
+ "加载Sentence Transformer模型...\n",
+ "对规范注释进行编码...\n",
+ "开始匹配注释...\n",
+ "处理第 0/127 条记录...\n",
+ "处理第 100/127 条记录...\n",
+ "\n",
+ "匹配完成,共处理 127 条记录\n",
+ "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_transformer_top3_match.csv\n",
+ "\n",
+ "前3行匹配结果示例:\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
+ "匹配1: 讨论经过-病理科-医师姓名 (分数: 0.8398)\n",
+ "匹配2: 讨论经过-放射科-医师姓名 (分数: 0.8398)\n",
+ "匹配3: 讨论经过-放化疗科-医师姓名 (分数: 0.8384)\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
+ "匹配1: 辅助检查-乙肝病毒-医院名称 (分数: 0.8360)\n",
+ "匹配2: 辅助检查-骨扫描检查-医院名称 (分数: 0.8325)\n",
+ "匹配3: 讨论经过-病理科-医师姓名 (分数: 0.8265)\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
+ "匹配1: 入院时情况-患者年龄 (分数: 0.8390)\n",
+ "匹配2: 辅助检查-乙肝病毒-医院名称 (分数: 0.8046)\n",
+ "匹配3: 辅助检查-骨扫描检查-医院名称 (分数: 0.8012)\n",
+ "--------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import chardet\n",
+ "from sentence_transformers import SentenceTransformer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "import torch\n",
+ "\n",
+ "# 首先检测文件的实际编码\n",
+ "def detect_encoding(file_path):\n",
+ " with open(file_path, 'rb') as f:\n",
+ " result = chardet.detect(f.read())\n",
+ " return result['encoding']\n",
+ "\n",
+ "# 检测文件编码\n",
+ "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
+ "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
+ "\n",
+ "test_encoding = detect_encoding(test_file)\n",
+ "regular_encoding = detect_encoding(regular_file)\n",
+ "\n",
+ "print(f\"测试文件编码: {test_encoding}\")\n",
+ "print(f\"规范文件编码: {regular_encoding}\")\n",
+ "\n",
+ "# 尝试使用检测到的编码读取文件\n",
+ "try:\n",
+ " # 读取规范文件\n",
+ " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
+ " \n",
+ " # 读取测试数据\n",
+ " test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
+ " \n",
+ " print(\"文件成功读取!\")\n",
+ "except Exception as e:\n",
+ " print(f\"使用检测到的编码读取失败: {e}\")\n",
+ " \n",
+ " # 尝试其他常见编码\n",
+ " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
+ " \n",
+ " for enc in encodings:\n",
+ " try:\n",
+ " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
+ " test_df = pd.read_csv(test_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取测试文件\")\n",
+ " \n",
+ " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
+ " regular_df = pd.read_csv(regular_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取规范文件\")\n",
+ " \n",
+ " test_encoding = enc\n",
+ " regular_encoding = enc\n",
+ " break\n",
+ " except Exception as e:\n",
+ " print(f\"使用 {enc} 读取失败: {e}\")\n",
+ "\n",
+ "# 如果成功读取文件,继续处理\n",
+ "if 'test_df' in locals() and 'regular_df' in locals():\n",
+ " # 创建规范字典,键为注释,值为对应的规则\n",
+ " regular_annotations = regular_df['注释'].tolist()\n",
+ " \n",
+ " # 准备测试数据中的字段组合\n",
+ " combined_fields = []\n",
+ " \n",
+ " for _, row in test_df.iterrows():\n",
+ " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
+ " combined_fields.append(combined_field)\n",
+ " \n",
+ "    # Load the pretrained Sentence Transformer model\n",
+ "    print(\"加载Sentence Transformer模型...\")\n",
+ "    \n",
+ "    # NOTE: all-MiniLM-L6-v2 is English-oriented, which likely explains the poor\n",
+ "    # Chinese matches in this cell's output; a multilingual model such as\n",
+ "    # paraphrase-multilingual-MiniLM-L12-v2 would be a better fit here\n",
+ " model = SentenceTransformer('/home/limeng/SICT/lung_test/all-MiniLM-L6-v2')\n",
+ "\n",
+ " # 定义Sentence Transformer相似度计算方法,返回前3个最佳匹配\n",
+ " def calculate_top3_transformer_similarity(query, candidates, model):\n",
+ " \"\"\"计算Sentence Transformer相似度,返回前3个最佳匹配和分数\"\"\"\n",
+ " try:\n",
+ " # 编码查询和候选项\n",
+ " query_embedding = model.encode([query], convert_to_tensor=True)\n",
+ " candidate_embeddings = model.encode(candidates, convert_to_tensor=True)\n",
+ " \n",
+ " # 计算余弦相似度\n",
+ " cosine_scores = cosine_similarity(\n",
+ " query_embedding.cpu().numpy(), \n",
+ " candidate_embeddings.cpu().numpy()\n",
+ " )[0]\n",
+ " \n",
+ " # 找到前3个最佳匹配\n",
+ " top3_indices = np.argsort(cosine_scores)[::-1][:3]\n",
+ " top3_scores = cosine_scores[top3_indices]\n",
+ " \n",
+ " return top3_indices, top3_scores\n",
+ " except Exception as e:\n",
+ " print(f\"Transformer相似度计算失败: {e}\")\n",
+ " return [-1, -1, -1], [0.0, 0.0, 0.0]\n",
+ " \n",
+ "    # NOTE: the annotations should be encoded once here and reused across queries;\n",
+ "    # the current implementation still re-encodes every candidate per query\n",
+ "    # (an optimization sketch follows this cell)\n",
+ "    print(\"对规范注释进行编码...\")\n",
+ "    \n",
+ "    # 创建结果DataFrame\n",
+ "    result_data = []\n",
+ " \n",
+ " print(\"开始匹配注释...\")\n",
+ " for i in range(len(test_df)):\n",
+ " if i % 100 == 0:\n",
+ " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n",
+ " \n",
+ " query = combined_fields[i]\n",
+ " \n",
+ " # 使用Sentence Transformer相似度匹配,获取前3个最佳匹配\n",
+ " top3_indices, top3_scores = calculate_top3_transformer_similarity(query, regular_annotations, model)\n",
+ " \n",
+ " # 获取测试数据的相关字段\n",
+ " paragraph_name = test_df.iloc[i]['ParagraphName']\n",
+ " statement_name = test_df.iloc[i]['StatementName']\n",
+ " value_item_name = test_df.iloc[i]['ValueItemName']\n",
+ " display_string = test_df.iloc[i]['DisplayString']\n",
+ " \n",
+ " # 获取前3个规范数据的相关字段\n",
+ " regular_nodes = []\n",
+ " regular_annotations_matched = []\n",
+ " regular_descriptions = []\n",
+ " \n",
+ " for idx, score in zip(top3_indices, top3_scores):\n",
+ " if idx >= 0:\n",
+ " regular_nodes.append(regular_df.iloc[idx]['节点名'])\n",
+ " regular_annotations_matched.append(regular_df.iloc[idx]['注释'])\n",
+ " regular_descriptions.append(regular_df.iloc[idx]['说明'])\n",
+ " else:\n",
+ " regular_nodes.append(\"未匹配\")\n",
+ " regular_annotations_matched.append(\"未匹配\")\n",
+ " regular_descriptions.append(\"未匹配\")\n",
+ " \n",
+ " # 确保有3个结果(如果候选项少于3个)\n",
+ " while len(regular_nodes) < 3:\n",
+ " regular_nodes.append(\"未匹配\")\n",
+ " regular_annotations_matched.append(\"未匹配\")\n",
+ " regular_descriptions.append(\"未匹配\")\n",
+ " top3_scores = np.append(top3_scores, 0.0)\n",
+ " \n",
+ " # 添加到结果数据\n",
+ " result_data.append({\n",
+ " 'ParagraphName': paragraph_name,\n",
+ " 'StatementName': statement_name,\n",
+ " 'ValueItemName': value_item_name,\n",
+ " 'DisplayString': display_string,\n",
+ " '规范节点名1': regular_nodes[0],\n",
+ " '规范注释1': regular_annotations_matched[0],\n",
+ " '规范说明1': regular_descriptions[0],\n",
+ " '相似度分数1': top3_scores[0],\n",
+ " '规范节点名2': regular_nodes[1],\n",
+ " '规范注释2': regular_annotations_matched[1],\n",
+ " '规范说明2': regular_descriptions[1],\n",
+ " '相似度分数2': top3_scores[1],\n",
+ " '规范节点名3': regular_nodes[2],\n",
+ " '规范注释3': regular_annotations_matched[2],\n",
+ " '规范说明3': regular_descriptions[2],\n",
+ " '相似度分数3': top3_scores[2]\n",
+ " })\n",
+ " \n",
+ " # 创建结果DataFrame\n",
+ " result_df = pd.DataFrame(result_data)\n",
+ " \n",
+ " # 保存结果\n",
+ " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_transformer_top3_match.csv', \n",
+ " index=False, encoding=test_encoding)\n",
+ " \n",
+ " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n",
+ " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_transformer_top3_match.csv\")\n",
+ " \n",
+ " # 打印前3行匹配结果示例\n",
+ " print(\"\\n前3行匹配结果示例:\")\n",
+ " for i in range(min(3, len(result_df))):\n",
+ " print(f\"原始字段: {combined_fields[i]}\")\n",
+ " print(f\"匹配1: {result_df.iloc[i]['规范注释1']} (分数: {result_df.iloc[i]['相似度分数1']:.4f})\")\n",
+ " print(f\"匹配2: {result_df.iloc[i]['规范注释2']} (分数: {result_df.iloc[i]['相似度分数2']:.4f})\")\n",
+ " print(f\"匹配3: {result_df.iloc[i]['规范注释3']} (分数: {result_df.iloc[i]['相似度分数3']:.4f})\")\n",
+ " print(\"-\" * 50)\n",
+ "else:\n",
+ " print(\"无法读取文件,请手动检查文件编码\")"
+ ]
+ },
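+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The cell above prints \"对规范注释进行编码...\" but still re-encodes every candidate annotation for each query inside `calculate_top3_transformer_similarity`. A minimal caching sketch, assuming the `model` and `regular_annotations` defined above are in scope (not part of the recorded run):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "# Encode all candidate annotations once and cache the normalized embeddings\n",
+ "candidate_embeddings = model.encode(regular_annotations, convert_to_numpy=True, normalize_embeddings=True)\n",
+ "\n",
+ "def top3_with_cached_embeddings(query):\n",
+ "    # Dot product of L2-normalized vectors equals cosine similarity\n",
+ "    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)\n",
+ "    scores = (q @ candidate_embeddings.T).ravel()\n",
+ "    top3 = np.argsort(scores)[::-1][:3]\n",
+ "    return top3, scores[top3]"
+ ]
+ },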
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "测试文件编码: GB2312\n",
+ "规范文件编码: utf-8\n",
+ "文件成功读取!\n",
+ "初始化TF-IDF向量化器...\n",
+ "开始匹配注释...\n",
+ "处理第 0/127 条记录...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
+ " warnings.warn(\n",
+ "Building prefix dict from the default dictionary ...\n",
+ "Dumping model to file cache /tmp/jieba.cache\n",
+ "Loading model cost 0.934 seconds.\n",
+ "Prefix dict has been built successfully.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "处理第 100/127 条记录...\n",
+ "\n",
+ "匹配完成,共处理 127 条记录\n",
+ "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\n",
+ "\n",
+ "前3行匹配结果示例:\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
+ "匹配1: 患者姓名 (分数: 0.4840)\n",
+ "匹配2: 患者姓名 (分数: 0.4840)\n",
+ "匹配3: 姓名 (分数: 0.4637)\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
+ "匹配1: 患者性别 (分数: 0.5414)\n",
+ "匹配2: 性别 (分数: 0.5235)\n",
+ "匹配3: 性别 (分数: 0.5235)\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
+ "匹配1: 患者年龄 (分数: 0.5357)\n",
+ "匹配2: 年龄 (分数: 0.5170)\n",
+ "匹配3: 年龄 (分数: 0.5170)\n",
+ "--------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import chardet\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "import jieba\n",
+ "\n",
+ "# 首先检测文件的实际编码\n",
+ "def detect_encoding(file_path):\n",
+ " with open(file_path, 'rb') as f:\n",
+ " result = chardet.detect(f.read())\n",
+ " return result['encoding']\n",
+ "\n",
+ "# 检测文件编码\n",
+ "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
+ "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
+ "\n",
+ "test_encoding = detect_encoding(test_file)\n",
+ "regular_encoding = detect_encoding(regular_file)\n",
+ "\n",
+ "print(f\"测试文件编码: {test_encoding}\")\n",
+ "print(f\"规范文件编码: {regular_encoding}\")\n",
+ "\n",
+ "# 尝试使用检测到的编码读取文件\n",
+ "try:\n",
+ " # 读取规范文件\n",
+ " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
+ " \n",
+ " # 读取测试数据\n",
+ " test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
+ " \n",
+ " print(\"文件成功读取!\")\n",
+ "except Exception as e:\n",
+ " print(f\"使用检测到的编码读取失败: {e}\")\n",
+ " \n",
+ " # 尝试其他常见编码\n",
+ " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
+ " \n",
+ " for enc in encodings:\n",
+ " try:\n",
+ " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
+ " test_df = pd.read_csv(test_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取测试文件\")\n",
+ " \n",
+ " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
+ " regular_df = pd.read_csv(regular_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取规范文件\")\n",
+ " \n",
+ " test_encoding = enc\n",
+ " regular_encoding = enc\n",
+ " break\n",
+ " except Exception as e:\n",
+ " print(f\"使用 {enc} 读取失败: {e}\")\n",
+ "\n",
+ "# 如果成功读取文件,继续处理\n",
+ "if 'test_df' in locals() and 'regular_df' in locals():\n",
+ " # 创建规范字典,键为注释,值为对应的规则\n",
+ " regular_annotations = regular_df['注释'].tolist()\n",
+ " \n",
+ " # 准备测试数据中的字段组合\n",
+ " combined_fields = []\n",
+ " \n",
+ " for _, row in test_df.iterrows():\n",
+ " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
+ " combined_fields.append(combined_field)\n",
+ " \n",
+ " # 使用TF-IDF向量化文本\n",
+ " print(\"初始化TF-IDF向量化器...\")\n",
+ " \n",
+ " # 对中文文本进行分词处理\n",
+ " def tokenize_chinese(text):\n",
+ " return list(jieba.cut(text))\n",
+ " \n",
+ " # 初始化TF-IDF向量化器\n",
+ " tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n",
+ " \n",
+ " # 定义TF-IDF相似度计算方法,返回前3个最佳匹配\n",
+ " def calculate_top3_tfidf_similarity(query, candidates, vectorizer):\n",
+ " \"\"\"计算TF-IDF相似度,返回前3个最佳匹配和分数\"\"\"\n",
+ " try:\n",
+ " # 将所有文本合并为一个列表进行向量化\n",
+ " all_texts = [query] + candidates\n",
+ " \n",
+ " # 拟合并转换所有文本\n",
+ " tfidf_matrix = vectorizer.fit_transform(all_texts)\n",
+ " \n",
+ " # 计算查询与所有候选项的余弦相似度\n",
+ " query_vector = tfidf_matrix[0:1]\n",
+ " candidate_vectors = tfidf_matrix[1:]\n",
+ " \n",
+ " cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n",
+ " \n",
+ " # 找到前3个最佳匹配\n",
+ " top3_indices = np.argsort(cosine_scores)[::-1][:3]\n",
+ " top3_scores = cosine_scores[top3_indices]\n",
+ " \n",
+ " return top3_indices, top3_scores\n",
+ " except Exception as e:\n",
+ " print(f\"TF-IDF相似度计算失败: {e}\")\n",
+ " return [-1, -1, -1], [0.0, 0.0, 0.0]\n",
+ " \n",
+ " # 创建结果DataFrame\n",
+ " result_data = []\n",
+ " \n",
+ " print(\"开始匹配注释...\")\n",
+ " for i in range(len(test_df)):\n",
+ " if i % 100 == 0:\n",
+ " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n",
+ " \n",
+ " query = combined_fields[i]\n",
+ " \n",
+ " # 使用TF-IDF相似度匹配,获取前3个最佳匹配\n",
+ " top3_indices, top3_scores = calculate_top3_tfidf_similarity(query, regular_annotations, tfidf_vectorizer)\n",
+ " \n",
+ " # 获取测试数据的相关字段\n",
+ " paragraph_name = test_df.iloc[i]['ParagraphName']\n",
+ " statement_name = test_df.iloc[i]['StatementName']\n",
+ " value_item_name = test_df.iloc[i]['ValueItemName']\n",
+ " display_string = test_df.iloc[i]['DisplayString']\n",
+ " \n",
+ " # 获取前3个规范数据的相关字段\n",
+ " regular_nodes = []\n",
+ " regular_annotations_matched = []\n",
+ " regular_descriptions = []\n",
+ " \n",
+ " for idx, score in zip(top3_indices, top3_scores):\n",
+ " if idx >= 0:\n",
+ " regular_nodes.append(regular_df.iloc[idx]['节点名'])\n",
+ " regular_annotations_matched.append(regular_df.iloc[idx]['注释'])\n",
+ " regular_descriptions.append(regular_df.iloc[idx]['说明'])\n",
+ " else:\n",
+ " regular_nodes.append(\"未匹配\")\n",
+ " regular_annotations_matched.append(\"未匹配\")\n",
+ " regular_descriptions.append(\"未匹配\")\n",
+ " \n",
+ " # 确保有3个结果(如果候选项少于3个)\n",
+ " while len(regular_nodes) < 3:\n",
+ " regular_nodes.append(\"未匹配\")\n",
+ " regular_annotations_matched.append(\"未匹配\")\n",
+ " regular_descriptions.append(\"未匹配\")\n",
+ " top3_scores = np.append(top3_scores, 0.0)\n",
+ " \n",
+ " # 添加到结果数据\n",
+ " result_data.append({\n",
+ " 'ParagraphName': paragraph_name,\n",
+ " 'StatementName': statement_name,\n",
+ " 'ValueItemName': value_item_name,\n",
+ " 'DisplayString': display_string,\n",
+ " '规范节点名1': regular_nodes[0],\n",
+ " '规范注释1': regular_annotations_matched[0],\n",
+ " '规范说明1': regular_descriptions[0],\n",
+ " '相似度分数1': top3_scores[0],\n",
+ " '规范节点名2': regular_nodes[1],\n",
+ " '规范注释2': regular_annotations_matched[1],\n",
+ " '规范说明2': regular_descriptions[1],\n",
+ " '相似度分数2': top3_scores[1],\n",
+ " '规范节点名3': regular_nodes[2],\n",
+ " '规范注释3': regular_annotations_matched[2],\n",
+ " '规范说明3': regular_descriptions[2],\n",
+ " '相似度分数3': top3_scores[2]\n",
+ " })\n",
+ " \n",
+ " # 创建结果DataFrame\n",
+ " result_df = pd.DataFrame(result_data)\n",
+ " \n",
+ " # 保存结果\n",
+ " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv', \n",
+ " index=False, encoding=test_encoding)\n",
+ " \n",
+ " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n",
+ " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\")\n",
+ " \n",
+ " # 打印前3行匹配结果示例\n",
+ " print(\"\\n前3行匹配结果示例:\")\n",
+ " for i in range(min(3, len(result_df))):\n",
+ " print(f\"原始字段: {combined_fields[i]}\")\n",
+ " print(f\"匹配1: {result_df.iloc[i]['规范注释1']} (分数: {result_df.iloc[i]['相似度分数1']:.4f})\")\n",
+ " print(f\"匹配2: {result_df.iloc[i]['规范注释2']} (分数: {result_df.iloc[i]['相似度分数2']:.4f})\")\n",
+ " print(f\"匹配3: {result_df.iloc[i]['规范注释3']} (分数: {result_df.iloc[i]['相似度分数3']:.4f})\")\n",
+ " print(\"-\" * 50)\n",
+ "else:\n",
+ " print(\"无法读取文件,请手动检查文件编码\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "测试文件编码: GB2312\n",
+ "规范文件编码: utf-8\n",
+ "文件成功读取!\n",
+ "初始化TF-IDF向量化器...\n",
+ "开始匹配注释...\n",
+ "处理第 0/127 条记录...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "处理第 100/127 条记录...\n",
+ "\n",
+ "匹配完成,共处理 127 条记录\n",
+ "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_fuzzy_match.csv\n",
+ "\n",
+ "前3行匹配结果示例:\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
+ "TF-IDF匹配结果:\n",
+ " 匹配1: 患者姓名 (分数: 0.4840)\n",
+ " 匹配2: 患者姓名 (分数: 0.4840)\n",
+ " 匹配3: 姓名 (分数: 0.4637)\n",
+ "FuzzyWuzzy匹配结果:\n",
+ " 匹配1: 患者姓名 (分数: 0.3300)\n",
+ " 匹配2: 入院时情况-患者姓名 (分数: 0.3300)\n",
+ " 匹配3: 患者姓名 (分数: 0.3300)\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
+ "TF-IDF匹配结果:\n",
+ " 匹配1: 患者性别 (分数: 0.5414)\n",
+ " 匹配2: 性别 (分数: 0.5235)\n",
+ " 匹配3: 性别 (分数: 0.5235)\n",
+ "FuzzyWuzzy匹配结果:\n",
+ " 匹配1: 患者姓名 (分数: 0.3600)\n",
+ " 匹配2: 入院时情况-患者姓名 (分数: 0.3600)\n",
+ " 匹配3: 患者姓名 (分数: 0.3600)\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
+ "TF-IDF匹配结果:\n",
+ " 匹配1: 患者年龄 (分数: 0.5357)\n",
+ " 匹配2: 年龄 (分数: 0.5170)\n",
+ " 匹配3: 年龄 (分数: 0.5170)\n",
+ "FuzzyWuzzy匹配结果:\n",
+ " 匹配1: 患者姓名 (分数: 0.3600)\n",
+ " 匹配2: 入院时情况-患者姓名 (分数: 0.3600)\n",
+ " 匹配3: 患者姓名 (分数: 0.3600)\n",
+ "--------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import chardet\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "import jieba\n",
+ "from fuzzywuzzy import fuzz\n",
+ "from fuzzywuzzy import process\n",
+ "\n",
+ "# 首先检测文件的实际编码\n",
+ "def detect_encoding(file_path):\n",
+ " with open(file_path, 'rb') as f:\n",
+ " result = chardet.detect(f.read())\n",
+ " return result['encoding']\n",
+ "\n",
+ "# 检测文件编码\n",
+ "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
+ "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
+ "\n",
+ "test_encoding = detect_encoding(test_file)\n",
+ "regular_encoding = detect_encoding(regular_file)\n",
+ "\n",
+ "print(f\"测试文件编码: {test_encoding}\")\n",
+ "print(f\"规范文件编码: {regular_encoding}\")\n",
+ "\n",
+ "# 尝试使用检测到的编码读取文件\n",
+ "try:\n",
+ " # 读取规范文件\n",
+ " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
+ " \n",
+ " # 读取测试数据\n",
+ " test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
+ " \n",
+ " print(\"文件成功读取!\")\n",
+ "except Exception as e:\n",
+ " print(f\"使用检测到的编码读取失败: {e}\")\n",
+ " \n",
+ " # 尝试其他常见编码\n",
+ " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
+ " \n",
+ " for enc in encodings:\n",
+ " try:\n",
+ " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
+ " test_df = pd.read_csv(test_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取测试文件\")\n",
+ " \n",
+ " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
+ " regular_df = pd.read_csv(regular_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取规范文件\")\n",
+ " \n",
+ " test_encoding = enc\n",
+ " regular_encoding = enc\n",
+ " break\n",
+ " except Exception as e:\n",
+ " print(f\"使用 {enc} 读取失败: {e}\")\n",
+ "\n",
+ "# 如果成功读取文件,继续处理\n",
+ "if 'test_df' in locals() and 'regular_df' in locals():\n",
+ " # 创建规范字典,键为注释,值为对应的规则\n",
+ " regular_annotations = regular_df['注释'].tolist()\n",
+ " \n",
+ " # 准备测试数据中的字段组合\n",
+ " combined_fields = []\n",
+ " \n",
+ " for _, row in test_df.iterrows():\n",
+ " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
+ " combined_fields.append(combined_field)\n",
+ " \n",
+ " # 使用TF-IDF向量化文本\n",
+ " print(\"初始化TF-IDF向量化器...\")\n",
+ " \n",
+ " # 对中文文本进行分词处理\n",
+ " def tokenize_chinese(text):\n",
+ " return list(jieba.cut(text))\n",
+ " \n",
+ " # 初始化TF-IDF向量化器\n",
+ " tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n",
+ " \n",
+ " # 定义TF-IDF相似度计算方法,返回最佳匹配(最多3个)\n",
+ " def calculate_tfidf_similarity(query, candidates, vectorizer, max_matches=3, threshold=0.1):\n",
+ " \"\"\"计算TF-IDF相似度,返回最佳匹配(最多max_matches个)\"\"\"\n",
+ " try:\n",
+ " # 将所有文本合并为一个列表进行向量化\n",
+ " all_texts = [query] + candidates\n",
+ " \n",
+ " # 拟合并转换所有文本\n",
+ " tfidf_matrix = vectorizer.fit_transform(all_texts)\n",
+ " \n",
+ " # 计算查询与所有候选项的余弦相似度\n",
+ " query_vector = tfidf_matrix[0:1]\n",
+ " candidate_vectors = tfidf_matrix[1:]\n",
+ " \n",
+ " cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n",
+ " \n",
+ " # 找到相似度大于阈值的匹配\n",
+ " valid_indices = np.where(cosine_scores > threshold)[0]\n",
+ " \n",
+ " # 按相似度降序排序\n",
+ " sorted_indices = valid_indices[np.argsort(cosine_scores[valid_indices])[::-1]]\n",
+ " \n",
+ " # 最多取max_matches个\n",
+ " top_indices = sorted_indices[:max_matches]\n",
+ " top_scores = cosine_scores[top_indices]\n",
+ " \n",
+ " return top_indices, top_scores\n",
+ " except Exception as e:\n",
+ " print(f\"TF-IDF相似度计算失败: {e}\")\n",
+ " return np.array([]), np.array([])\n",
+ " \n",
+ " # 定义FuzzyWuzzy相似度计算方法,返回最佳匹配(最多3个)\n",
+ " def calculate_fuzzy_similarity(query, candidates, max_matches=3):\n",
+ " \"\"\"计算FuzzyWuzzy相似度,返回最佳匹配(最多max_matches个)\"\"\"\n",
+ " try:\n",
+ "            # Build {index: text} so process.extract can return the true index;\n",
+ "            # list.index(matched_text) would return only the FIRST occurrence and\n",
+ "            # mis-index duplicate annotations\n",
+ "            choice_map = dict(enumerate(candidates))\n",
+ "            matches = process.extract(query, choice_map, limit=max_matches, scorer=fuzz.token_sort_ratio)\n",
+ "            \n",
+ "            # 提取索引和分数\n",
+ "            indices = []\n",
+ "            scores = []\n",
+ "            \n",
+ "            # With dict choices each match is a (text, score, key) triple\n",
+ "            for matched_text, score, idx in matches:\n",
+ "                indices.append(idx)\n",
+ "                scores.append(score / 100.0)  # normalize the 0-100 ratio to 0-1\n",
+ " \n",
+ " return np.array(indices), np.array(scores)\n",
+ " except Exception as e:\n",
+ " print(f\"FuzzyWuzzy相似度计算失败: {e}\")\n",
+ " return np.array([]), np.array([])\n",
+ " \n",
+ " # 创建结果DataFrame\n",
+ " result_data = []\n",
+ " \n",
+ " print(\"开始匹配注释...\")\n",
+ " \n",
+ " for i in range(len(test_df)):\n",
+ " if i % 100 == 0:\n",
+ " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n",
+ " \n",
+ " query = combined_fields[i]\n",
+ " \n",
+ " # 使用TF-IDF相似度匹配,获取最佳匹配(最多3个)\n",
+ " tfidf_indices, tfidf_scores = calculate_tfidf_similarity(query, regular_annotations, tfidf_vectorizer)\n",
+ " \n",
+ " # 使用FuzzyWuzzy相似度匹配,获取最佳匹配(最多3个)\n",
+ " fuzzy_indices, fuzzy_scores = calculate_fuzzy_similarity(query, regular_annotations)\n",
+ " \n",
+ " # 获取测试数据的相关字段\n",
+ " paragraph_name = test_df.iloc[i]['ParagraphName']\n",
+ " statement_name = test_df.iloc[i]['StatementName']\n",
+ " value_item_name = test_df.iloc[i]['ValueItemName']\n",
+ " display_string = test_df.iloc[i]['DisplayString']\n",
+ " \n",
+ " # 创建结果字典\n",
+ " result_dict = {\n",
+ " 'ParagraphName': paragraph_name,\n",
+ " 'StatementName': statement_name,\n",
+ " 'ValueItemName': value_item_name,\n",
+ " 'DisplayString': display_string\n",
+ " }\n",
+ " \n",
+ " # 添加TF-IDF匹配结果\n",
+ " for j in range(min(3, len(tfidf_indices))):\n",
+ " idx = tfidf_indices[j]\n",
+ " score = tfidf_scores[j]\n",
+ " result_dict[f'TFIDF_规范节点名{j+1}'] = regular_df.iloc[idx]['节点名']\n",
+ " result_dict[f'TFIDF_规范注释{j+1}'] = regular_df.iloc[idx]['注释']\n",
+ " result_dict[f'TFIDF_规范说明{j+1}'] = regular_df.iloc[idx]['说明']\n",
+ " result_dict[f'TFIDF_相似度分数{j+1}'] = score\n",
+ " \n",
+ " # 添加FuzzyWuzzy匹配结果\n",
+ " for j in range(min(3, len(fuzzy_indices))):\n",
+ " idx = fuzzy_indices[j]\n",
+ " score = fuzzy_scores[j]\n",
+ " result_dict[f'Fuzzy_规范节点名{j+1}'] = regular_df.iloc[idx]['节点名']\n",
+ " result_dict[f'Fuzzy_规范注释{j+1}'] = regular_df.iloc[idx]['注释']\n",
+ " result_dict[f'Fuzzy_规范说明{j+1}'] = regular_df.iloc[idx]['说明']\n",
+ " result_dict[f'Fuzzy_相似度分数{j+1}'] = score\n",
+ " \n",
+ " # 添加到结果数据\n",
+ " result_data.append(result_dict)\n",
+ " \n",
+ " # 创建结果DataFrame\n",
+ " result_df = pd.DataFrame(result_data)\n",
+ " \n",
+ " # 保存结果\n",
+ " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_fuzzy_match.csv', \n",
+ " index=False, encoding=test_encoding)\n",
+ " \n",
+ " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n",
+ " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_fuzzy_match.csv\")\n",
+ " \n",
+ " # 打印前3行匹配结果示例\n",
+ " print(\"\\n前3行匹配结果示例:\")\n",
+ " for i in range(min(3, len(result_df))):\n",
+ " print(f\"原始字段: {combined_fields[i]}\")\n",
+ " \n",
+ " print(\"TF-IDF匹配结果:\")\n",
+ " for j in range(1, 4):\n",
+ " if f'TFIDF_规范注释{j}' in result_df.columns and not pd.isna(result_df.iloc[i].get(f'TFIDF_规范注释{j}', None)):\n",
+ " print(f\" 匹配{j}: {result_df.iloc[i][f'TFIDF_规范注释{j}']} (分数: {result_df.iloc[i][f'TFIDF_相似度分数{j}']:.4f})\")\n",
+ " \n",
+ " print(\"FuzzyWuzzy匹配结果:\")\n",
+ " for j in range(1, 4):\n",
+ " if f'Fuzzy_规范注释{j}' in result_df.columns and not pd.isna(result_df.iloc[i].get(f'Fuzzy_规范注释{j}', None)):\n",
+ " print(f\" 匹配{j}: {result_df.iloc[i][f'Fuzzy_规范注释{j}']} (分数: {result_df.iloc[i][f'Fuzzy_相似度分数{j}']:.4f})\")\n",
+ " \n",
+ " print(\"-\" * 50)\n",
+ "else:\n",
+ " print(\"无法读取文件,请手动检查文件编码\")"
+ ]
+ },
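+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The dict-based indexing above keeps fuzzywuzzy honest about duplicate annotations; the maintained successor `rapidfuzz` returns the index directly. A sketch, assuming the rapidfuzz package is installed:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch assuming rapidfuzz is available (pip install rapidfuzz)\n",
+ "from rapidfuzz import fuzz, process\n",
+ "\n",
+ "def fuzzy_top3(query, candidates):\n",
+ "    # rapidfuzz returns (match, score, index) triples for list choices\n",
+ "    matches = process.extract(query, candidates, scorer=fuzz.token_sort_ratio, limit=3)\n",
+ "    indices = [idx for _, _, idx in matches]\n",
+ "    scores = [score / 100.0 for _, score, _ in matches]\n",
+ "    return indices, scores"
+ ]
+ },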
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
+ " warnings.warn(\n",
+ "Building prefix dict from the default dictionary ...\n",
+ "Loading model from cache /tmp/jieba.cache\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "测试文件编码: GB2312\n",
+ "规范文件编码: utf-8\n",
+ "文件成功读取!\n",
+ "初始化TF-IDF向量化器...\n",
+ "开始匹配注释...\n",
+ "处理第 0/127 条记录...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading model cost 0.942 seconds.\n",
+ "Prefix dict has been built successfully.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "处理第 100/127 条记录...\n",
+ "\n",
+ "匹配完成,共处理 127 条记录\n",
+ "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv\n",
+ "\n",
+ "前3行匹配结果示例:\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
+ "最佳匹配: 患者姓名 (分数: 0.4840)\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
+ "最佳匹配: 患者性别 (分数: 0.5414)\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
+ "最佳匹配: 患者年龄 (分数: 0.5357)\n",
+ "--------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import chardet\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "import jieba\n",
+ "\n",
+ "# 首先检测文件的实际编码\n",
+ "def detect_encoding(file_path):\n",
+ " with open(file_path, 'rb') as f:\n",
+ " result = chardet.detect(f.read())\n",
+ " return result['encoding']\n",
+ "\n",
+ "# 检测文件编码\n",
+ "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
+ "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
+ "\n",
+ "test_encoding = detect_encoding(test_file)\n",
+ "regular_encoding = detect_encoding(regular_file)\n",
+ "\n",
+ "print(f\"测试文件编码: {test_encoding}\")\n",
+ "print(f\"规范文件编码: {regular_encoding}\")\n",
+ "\n",
+ "# 尝试使用检测到的编码读取文件\n",
+ "try:\n",
+ " # 读取规范文件\n",
+ " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
+ " \n",
+ " # 读取测试数据\n",
+ " test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
+ " \n",
+ " print(\"文件成功读取!\")\n",
+ "except Exception as e:\n",
+ " print(f\"使用检测到的编码读取失败: {e}\")\n",
+ " \n",
+ " # 尝试其他常见编码\n",
+ " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
+ " \n",
+ " for enc in encodings:\n",
+ " try:\n",
+ " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
+ " test_df = pd.read_csv(test_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取测试文件\")\n",
+ " \n",
+ " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
+ " regular_df = pd.read_csv(regular_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取规范文件\")\n",
+ " \n",
+ " test_encoding = enc\n",
+ " regular_encoding = enc\n",
+ " break\n",
+ " except Exception as e:\n",
+ " print(f\"使用 {enc} 读取失败: {e}\")\n",
+ "\n",
+ "# 如果成功读取文件,继续处理\n",
+ "if 'test_df' in locals() and 'regular_df' in locals():\n",
+ " # 创建规范字典,键为注释,值为对应的规则\n",
+ " regular_annotations = regular_df['注释'].tolist()\n",
+ " \n",
+ " # 准备测试数据中的字段组合\n",
+ " combined_fields = []\n",
+ " \n",
+ " for _, row in test_df.iterrows():\n",
+ " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
+ " combined_fields.append(combined_field)\n",
+ " \n",
+ " # 使用TF-IDF向量化文本\n",
+ " print(\"初始化TF-IDF向量化器...\")\n",
+ " \n",
+ " # 对中文文本进行分词处理\n",
+ " def tokenize_chinese(text):\n",
+ " return list(jieba.cut(text))\n",
+ " \n",
+ " # 初始化TF-IDF向量化器\n",
+ " tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n",
+ " \n",
+ " # 定义TF-IDF相似度计算方法,只返回最佳匹配(分数最高的)\n",
+ " def calculate_best_tfidf_match(query, candidates, vectorizer):\n",
+ " \"\"\"计算TF-IDF相似度,只返回最佳匹配(分数最高的)\"\"\"\n",
+ " try:\n",
+ " # 将所有文本合并为一个列表进行向量化\n",
+ " all_texts = [query] + candidates\n",
+ " \n",
+ " # 拟合并转换所有文本\n",
+ " tfidf_matrix = vectorizer.fit_transform(all_texts)\n",
+ " \n",
+ " # 计算查询与所有候选项的余弦相似度\n",
+ " query_vector = tfidf_matrix[0:1]\n",
+ " candidate_vectors = tfidf_matrix[1:]\n",
+ " \n",
+ " cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n",
+ " \n",
+ " # 找到分数最高的匹配\n",
+ " best_index = np.argmax(cosine_scores)\n",
+ " best_score = cosine_scores[best_index]\n",
+ " \n",
+ " return best_index, best_score\n",
+ " except Exception as e:\n",
+ " print(f\"TF-IDF相似度计算失败: {e}\")\n",
+ " return -1, 0.0\n",
+ " \n",
+ " # 创建结果DataFrame\n",
+ " result_data = []\n",
+ " \n",
+ " print(\"开始匹配注释...\")\n",
+ " \n",
+ " for i in range(len(test_df)):\n",
+ " if i % 100 == 0:\n",
+ " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n",
+ " \n",
+ " query = combined_fields[i]\n",
+ " \n",
+ " # 使用TF-IDF相似度匹配,只获取最佳匹配\n",
+ " best_index, best_score = calculate_best_tfidf_match(query, regular_annotations, tfidf_vectorizer)\n",
+ " \n",
+ " # 获取测试数据的相关字段\n",
+ " row = test_df.iloc[i]\n",
+ " \n",
+ " # 创建结果字典,包含原始字段\n",
+ " result_dict = {\n",
+ " 'ParagraphName': row['ParagraphName'],\n",
+ " 'StatementName': row['StatementName'],\n",
+ " 'ValueItemName': row['ValueItemName'],\n",
+ " 'DisplayString': row['DisplayString']\n",
+ " }\n",
+ " \n",
+ " # 添加SFZH, XGRQ, IPBLH字段(如果存在)\n",
+ " if 'SFZH' in test_df.columns:\n",
+ " result_dict['SFZH'] = row['SFZH']\n",
+ " if 'XGRQ' in test_df.columns:\n",
+ " result_dict['XGRQ'] = row['XGRQ']\n",
+ " if 'IPBLH' in test_df.columns:\n",
+ " result_dict['IPBLH'] = row['IPBLH']\n",
+ " \n",
+ " # 添加最佳TF-IDF匹配结果\n",
+ " if best_index >= 0:\n",
+ " result_dict['TFIDF_规范节点名'] = regular_df.iloc[best_index]['节点名']\n",
+ " result_dict['TFIDF_规范注释'] = regular_df.iloc[best_index]['注释']\n",
+ " result_dict['TFIDF_规范说明'] = regular_df.iloc[best_index]['说明']\n",
+ " result_dict['TFIDF_相似度分数'] = best_score\n",
+ " else:\n",
+ " result_dict['TFIDF_规范节点名'] = ''\n",
+ " result_dict['TFIDF_规范注释'] = ''\n",
+ " result_dict['TFIDF_规范说明'] = ''\n",
+ " result_dict['TFIDF_相似度分数'] = 0.0\n",
+ " \n",
+ " # 添加到结果数据\n",
+ " result_data.append(result_dict)\n",
+ " \n",
+ " # 创建结果DataFrame\n",
+ " result_df = pd.DataFrame(result_data)\n",
+ " \n",
+ " # 保存结果\n",
+ " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv', \n",
+ " index=False, encoding=test_encoding)\n",
+ " \n",
+ " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n",
+ " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv\")\n",
+ " \n",
+ " # 打印前3行匹配结果示例\n",
+ " print(\"\\n前3行匹配结果示例:\")\n",
+ " for i in range(min(3, len(result_df))):\n",
+ " print(f\"原始字段: {combined_fields[i]}\")\n",
+ " print(f\"最佳匹配: {result_df.iloc[i]['TFIDF_规范注释']} (分数: {result_df.iloc[i]['TFIDF_相似度分数']:.4f})\")\n",
+ " print(\"-\" * 50)\n",
+ "else:\n",
+ " print(\"无法读取文件,请手动检查文件编码\")"
+ ]
+ },
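+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "All of the TF-IDF variants above loop over queries one at a time. Since the vectorizer can transform all queries at once, the whole matching step can be a single matrix product. A sketch, assuming `combined_fields` and `regular_annotations` from the cell above are in scope:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import jieba\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "\n",
+ "vec = TfidfVectorizer(tokenizer=lambda t: list(jieba.cut(t)), token_pattern=None)\n",
+ "candidate_matrix = vec.fit_transform(regular_annotations)\n",
+ "query_matrix = vec.transform(combined_fields)\n",
+ "\n",
+ "# (n_queries, n_candidates) similarity matrix; argmax per row is the best match\n",
+ "sim = cosine_similarity(query_matrix, candidate_matrix)\n",
+ "best_indices = sim.argmax(axis=1)\n",
+ "best_scores = sim[np.arange(sim.shape[0]), best_indices]"
+ ]
+ },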
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n",
+ "2025-03-10 09:55:11,393\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "INFO 03-10 09:55:19 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.\n",
+ "WARNING 03-10 09:55:19 config.py:428] gptq quantization is not fully optimized yet. The speed can be slower than non-quantized models.\n",
+ "INFO 03-10 09:55:19 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', speculative_config=None, tokenizer='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, chat_template_text_format=string, mm_processor_kwargs=None, pooler_config=None)\n",
+ "INFO 03-10 09:55:20 selector.py:261] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.\n",
+ "INFO 03-10 09:55:20 selector.py:144] Using XFormers backend.\n",
+ "INFO 03-10 09:55:20 model_runner.py:1072] Starting to load model /opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[W310 09:55:20.487166102 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())\n",
+ "Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00, ?it/s]\n",
+ "Loading safetensors checkpoint shards: 20% Completed | 1/5 [00:00<00:03, 1.24it/s]\n",
+ "Loading safetensors checkpoint shards: 40% Completed | 2/5 [00:03<00:04, 1.64s/it]\n",
+ "Loading safetensors checkpoint shards: 60% Completed | 3/5 [00:05<00:03, 1.92s/it]\n",
+ "Loading safetensors checkpoint shards: 80% Completed | 4/5 [00:08<00:02, 2.25s/it]\n",
+ "Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:10<00:00, 2.26s/it]\n",
+ "Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:10<00:00, 2.06s/it]\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "INFO 03-10 09:55:32 model_runner.py:1077] Loading model weights took 15.5028 GB\n",
+ "INFO 03-10 09:55:42 worker.py:232] Memory profiling results: total_gpu_memory=31.73GiB initial_memory_usage=15.89GiB peak_torch_memory=19.42GiB memory_usage_post_profile=15.96GiB non_torch_memory=0.45GiB kv_cache_size=8.69GiB gpu_memory_utilization=0.90\n",
+ "INFO 03-10 09:55:43 gpu_executor.py:113] # GPU blocks: 2965, # CPU blocks: 1365\n",
+ "INFO 03-10 09:55:43 gpu_executor.py:117] Maximum concurrency for 32768 tokens per request: 1.45x\n",
+ "INFO 03-10 09:55:46 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
+ "INFO 03-10 09:55:46 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
+ "INFO 03-10 09:56:08 model_runner.py:1518] Graph capturing finished in 22 secs, took 0.59 GiB\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import json\n",
+ "import re\n",
+ "\n",
+ "# Set environment variables before importing torch so they take effect\n",
+ "os.environ[\"NCCL_P2P_DISABLE\"] = \"1\"\n",
+ "os.environ[\"TRANSFORMERS_OFFLINE\"] = \"1\"\n",
+ "os.environ[\"HF_DATASETS_OFFLINE\"] = \"1\"\n",
+ "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n",
+ "\n",
+ "import torch\n",
+ "import torch.distributed as dist\n",
+ "from vllm import LLM, SamplingParams\n",
+ "\n",
+ "torch.cuda.empty_cache()\n",
+ "\n",
+ "# 加载量化模型 (load the GPTQ-quantized model with vLLM)\n",
+ "llm = LLM(model=\"/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8\", dtype=\"half\", gpu_memory_utilization=0.9)\n",
+ "\n",
+ "# Low-temperature sampling for near-deterministic structured extraction\n",
+ "sampling_params = SamplingParams(temperature=0.1, top_p=0.3, max_tokens=256)\n",
+ "\n",
+ "# 推理示例 (an inference example follows in the next cell)\n"
+ ]
+ },
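+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal inference sketch using the `llm` and `sampling_params` defined above. The prompt wording is illustrative (an assumption), not part of the recorded run:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hypothetical extraction prompt; adjust to the real task\n",
+ "prompts = [\n",
+ "    '从下面的病历文本中抽取患者年龄,只输出数字:\\n患者女性,22岁,因声音嘶哑一月入院。'\n",
+ "]\n",
+ "\n",
+ "outputs = llm.generate(prompts, sampling_params)\n",
+ "for out in outputs:\n",
+ "    # Each RequestOutput holds one or more completions; print the first\n",
+ "    print(out.outputs[0].text)"
+ ]
+ },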
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "测试文件编码: GB2312\n",
+ "规范文件编码: utf-8\n",
+ "文件成功读取!\n",
+ "初始化TF-IDF向量化器...\n",
+ "开始匹配注释...\n",
+ "处理第 0/127 条记录...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "处理第 100/127 条记录...\n",
+ "\n",
+ "匹配完成,共处理 127 条记录\n",
+ "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv\n",
+ "\n",
+ "前3行匹配结果示例:\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
+ "最佳匹配: 患者姓名 (分数: 0.4840)\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
+ "最佳匹配: 患者性别 (分数: 0.5414)\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
+ "最佳匹配: 患者年龄 (分数: 0.5357)\n",
+ "--------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import chardet\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "import jieba\n",
+ "\n",
+ "# 首先检测文件的实际编码\n",
+ "def detect_encoding(file_path):\n",
+ " with open(file_path, 'rb') as f:\n",
+ " result = chardet.detect(f.read())\n",
+ " return result['encoding']\n",
+ "\n",
+ "# 检测文件编码\n",
+ "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
+ "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
+ "\n",
+ "test_encoding = detect_encoding(test_file)\n",
+ "regular_encoding = detect_encoding(regular_file)\n",
+ "\n",
+ "print(f\"测试文件编码: {test_encoding}\")\n",
+ "print(f\"规范文件编码: {regular_encoding}\")\n",
+ "\n",
+ "# 尝试使用检测到的编码读取文件\n",
+ "try:\n",
+ " # 读取规范文件\n",
+ " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
+ " \n",
+ " # 读取测试数据\n",
+ " test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
+ " \n",
+ " print(\"文件成功读取!\")\n",
+ "except Exception as e:\n",
+ " print(f\"使用检测到的编码读取失败: {e}\")\n",
+ " \n",
+ " # 尝试其他常见编码\n",
+ " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
+ " \n",
+ " for enc in encodings:\n",
+ " try:\n",
+ " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
+ " test_df = pd.read_csv(test_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取测试文件\")\n",
+ " \n",
+ " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
+ " regular_df = pd.read_csv(regular_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取规范文件\")\n",
+ " \n",
+ " test_encoding = enc\n",
+ " regular_encoding = enc\n",
+ " break\n",
+ " except Exception as e:\n",
+ " print(f\"使用 {enc} 读取失败: {e}\")\n",
+ "\n",
+ "# 如果成功读取文件,继续处理\n",
+ "if 'test_df' in locals() and 'regular_df' in locals():\n",
+ " # 创建规范字典,键为注释,值为对应的规则\n",
+ " regular_annotations = regular_df['注释'].tolist()\n",
+ " \n",
+ " # 准备测试数据中的字段组合\n",
+ " combined_fields = []\n",
+ " \n",
+ " for _, row in test_df.iterrows():\n",
+ " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
+ " combined_fields.append(combined_field)\n",
+ " \n",
+ " # 使用TF-IDF向量化文本\n",
+ " print(\"初始化TF-IDF向量化器...\")\n",
+ " \n",
+ " # 对中文文本进行分词处理\n",
+ " def tokenize_chinese(text):\n",
+ " return list(jieba.cut(text))\n",
+ " \n",
+ " # 初始化TF-IDF向量化器\n",
+ " tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n",
+ " \n",
+ " # 定义TF-IDF相似度计算方法,只返回最佳匹配(分数最高的)\n",
+ " def calculate_best_tfidf_match(query, candidates, vectorizer):\n",
+ " \"\"\"计算TF-IDF相似度,只返回最佳匹配(分数最高的)\"\"\"\n",
+ " try:\n",
+ " # 将所有文本合并为一个列表进行向量化\n",
+ " all_texts = [query] + candidates\n",
+ " \n",
+ " # 拟合并转换所有文本\n",
+ " tfidf_matrix = vectorizer.fit_transform(all_texts)\n",
+ " \n",
+ " # 计算查询与所有候选项的余弦相似度\n",
+ " query_vector = tfidf_matrix[0:1]\n",
+ " candidate_vectors = tfidf_matrix[1:]\n",
+ " \n",
+ " cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n",
+ " \n",
+ " # 找到分数最高的匹配\n",
+ " best_index = np.argmax(cosine_scores)\n",
+ " best_score = cosine_scores[best_index]\n",
+ " \n",
+ " return best_index, best_score\n",
+ " except Exception as e:\n",
+ " print(f\"TF-IDF相似度计算失败: {e}\")\n",
+ " return -1, 0.0\n",
+ " \n",
+ " # 创建结果DataFrame\n",
+ " result_data = []\n",
+ " \n",
+ " print(\"开始匹配注释...\")\n",
+ " \n",
+ " for i in range(len(test_df)):\n",
+ " if i % 100 == 0:\n",
+ " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n",
+ " \n",
+ " query = combined_fields[i]\n",
+ " \n",
+ " # 使用TF-IDF相似度匹配,只获取最佳匹配\n",
+ " best_index, best_score = calculate_best_tfidf_match(query, regular_annotations, tfidf_vectorizer)\n",
+ " \n",
+ " # 获取测试数据的相关字段\n",
+ " row = test_df.iloc[i]\n",
+ " \n",
+ " # 创建结果字典,包含原始字段\n",
+ " result_dict = {\n",
+ " 'ParagraphName': row['ParagraphName'],\n",
+ " 'StatementName': row['StatementName'],\n",
+ " 'ValueItemName': row['ValueItemName'],\n",
+ " 'DisplayString': row['DisplayString']\n",
+ " }\n",
+ " \n",
+ "        # 添加Id, SFZH, XGRQ, IPBLH字段(如果存在)\n",
+ "        for col in ('Id', 'SFZH', 'XGRQ', 'IPBLH'):\n",
+ "            if col in test_df.columns:\n",
+ "                result_dict[col] = row[col]\n",
+ " \n",
+ " # 添加最佳TF-IDF匹配结果\n",
+ " if best_index >= 0:\n",
+ " result_dict['TFIDF_规范节点名'] = regular_df.iloc[best_index]['节点名']\n",
+ " result_dict['TFIDF_规范注释'] = regular_df.iloc[best_index]['注释']\n",
+ " result_dict['TFIDF_规范说明'] = regular_df.iloc[best_index]['说明']\n",
+ " result_dict['TFIDF_相似度分数'] = best_score\n",
+ " else:\n",
+ " result_dict['TFIDF_规范节点名'] = ''\n",
+ " result_dict['TFIDF_规范注释'] = ''\n",
+ " result_dict['TFIDF_规范说明'] = ''\n",
+ " result_dict['TFIDF_相似度分数'] = 0.0\n",
+ " \n",
+ " # 添加到结果数据\n",
+ " result_data.append(result_dict)\n",
+ " \n",
+ " # 创建结果DataFrame\n",
+ " result_df = pd.DataFrame(result_data)\n",
+ " \n",
+ " # 保存结果\n",
+ " result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv', \n",
+ " index=False, encoding=test_encoding)\n",
+ " \n",
+ " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n",
+ " print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv\")\n",
+ " \n",
+ " # 打印前3行匹配结果示例\n",
+ " print(\"\\n前3行匹配结果示例:\")\n",
+ " for i in range(min(3, len(result_df))):\n",
+ " print(f\"原始字段: {combined_fields[i]}\")\n",
+ " print(f\"最佳匹配: {result_df.iloc[i]['TFIDF_规范注释']} (分数: {result_df.iloc[i]['TFIDF_相似度分数']:.4f})\")\n",
+ " print(\"-\" * 50)\n",
+ "else:\n",
+ " print(\"无法读取文件,请手动检查文件编码\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "测试文件编码: GB2312\n",
+ "规范文件编码: utf-8\n",
+ "文件成功读取!\n",
+ "\n",
+ "测试文件的列名:\n",
+ "['Id', 'PatientName', 'IPBLH', 'OPBLH', 'KH', 'KLX', 'SexId', 'CSRQ', 'ZJLX', 'SFZH', 'HYZK', 'ZYDM', 'GJDM', 'MZDM', 'JZDZ', 'YB', 'JG', 'HKDZ', 'DHHM', 'SJHM', 'ABOBloodTypeId', 'LXRXM', 'LXRGX', 'LXRDH', 'ZLLB', 'ZLMC', 'XGRQ', 'YJLXH', 'RYSJ', 'FolderName', 'Xh', 'RecordXh', 'FolderId', 'DocumentName', 'InstanceId', 'DocumentId', 'ParagraphId', 'ParagraphName', 'StatementId', 'StatementName', 'ValueId', 'ValueItemName', 'ValueItemKind', 'RealValue', 'ValueString', 'DisplayString', 'ValuePostfix', 'WSJLSCSJ', 'WSJLXGSJ', 'upload_time']\n",
+ "\n",
+ "初始化TF-IDF向量化器...\n",
+ "开始匹配注释...\n",
+ "处理第 0/127 条记录...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "处理第 100/127 条记录...\n",
+ "\n",
+ "匹配完成,共处理 127 条记录\n",
+ "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_完整字段_tfidf_match.csv\n",
+ "\n",
+ "结果文件的列名:\n",
+ "['Id', 'PatientName', 'IPBLH', 'OPBLH', 'KH', 'KLX', 'SexId', 'CSRQ', 'ZJLX', 'SFZH', 'HYZK', 'ZYDM', 'GJDM', 'MZDM', 'JZDZ', 'YB', 'JG', 'HKDZ', 'DHHM', 'SJHM', 'ABOBloodTypeId', 'LXRXM', 'LXRGX', 'LXRDH', 'ZLLB', 'ZLMC', 'XGRQ', 'YJLXH', 'RYSJ', 'FolderName', 'Xh', 'RecordXh', 'FolderId', 'DocumentName', 'InstanceId', 'DocumentId', 'ParagraphId', 'ParagraphName', 'StatementId', 'StatementName', 'ValueId', 'ValueItemName', 'ValueItemKind', 'RealValue', 'ValueString', 'DisplayString', 'ValuePostfix', 'WSJLSCSJ', 'WSJLXGSJ', 'upload_time', '规范节点名', '规范注释', '规范说明', 'processed_string']\n",
+ "\n",
+ "前3行匹配结果示例:\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
+ "最佳匹配: 患者姓名\n",
+ "处理后字符串: 测试\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
+ "最佳匹配: 患者性别\n",
+ "处理后字符串: 女\n",
+ "--------------------------------------------------\n",
+ "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
+ "最佳匹配: 患者年龄\n",
+ "处理后字符串: 22岁\n",
+ "--------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import chardet\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "import jieba\n",
+ "\n",
+ "# 首先检测文件的实际编码\n",
+ "def detect_encoding(file_path):\n",
+ " with open(file_path, 'rb') as f:\n",
+ " result = chardet.detect(f.read())\n",
+ " return result['encoding']\n",
+ "\n",
+ "# 检测文件编码\n",
+ "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
+ "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
+ "\n",
+ "test_encoding = detect_encoding(test_file)\n",
+ "regular_encoding = detect_encoding(regular_file)\n",
+ "\n",
+ "print(f\"测试文件编码: {test_encoding}\")\n",
+ "print(f\"规范文件编码: {regular_encoding}\")\n",
+ "\n",
+ "# 尝试使用检测到的编码读取文件\n",
+ "try:\n",
+ " # 读取规范文件\n",
+ " regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
+ " \n",
+ " # 读取测试数据\n",
+ " test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
+ " \n",
+ " print(\"文件成功读取!\")\n",
+ "except Exception as e:\n",
+ " print(f\"使用检测到的编码读取失败: {e}\")\n",
+ " \n",
+ " # 尝试其他常见编码\n",
+ " encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
+ " \n",
+ " for enc in encodings:\n",
+ " try:\n",
+ " print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
+ " test_df = pd.read_csv(test_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取测试文件\")\n",
+ " \n",
+ " print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
+ " regular_df = pd.read_csv(regular_file, encoding=enc)\n",
+ " print(f\"成功使用 {enc} 读取规范文件\")\n",
+ " \n",
+ " test_encoding = enc\n",
+ " regular_encoding = enc\n",
+ " break\n",
+ " except Exception as e:\n",
+ " print(f\"使用 {enc} 读取失败: {e}\")\n",
+ "\n",
+ "# 如果成功读取文件,继续处理\n",
+ "if 'test_df' in locals() and 'regular_df' in locals():\n",
+ " # 打印测试文件的列名,以供参考\n",
+ " print(\"\\n测试文件的列名:\")\n",
+ " print(test_df.columns.tolist())\n",
+ " \n",
+ " # 创建规范字典,键为注释,值为对应的规则\n",
+ " regular_annotations = regular_df['注释'].tolist()\n",
+ " \n",
+ " # 准备测试数据中的字段组合\n",
+ " combined_fields = []\n",
+ " \n",
+ " for _, row in test_df.iterrows():\n",
+ " combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
+ " combined_fields.append(combined_field)\n",
+ " \n",
+ " # 使用TF-IDF向量化文本\n",
+ " print(\"\\n初始化TF-IDF向量化器...\")\n",
+ " \n",
+ " # 对中文文本进行分词处理\n",
+ " def tokenize_chinese(text):\n",
+ " return list(jieba.cut(text))\n",
+ " \n",
+ " # 初始化TF-IDF向量化器\n",
+ " tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n",
+ " \n",
+ " # 定义TF-IDF相似度计算方法,只返回最佳匹配(分数最高的)\n",
+ " def calculate_best_tfidf_match(query, candidates, vectorizer):\n",
+ " \"\"\"计算TF-IDF相似度,只返回最佳匹配(分数最高的)\"\"\"\n",
+ " try:\n",
+ " # 将所有文本合并为一个列表进行向量化\n",
+ " all_texts = [query] + candidates\n",
+ " \n",
+ " # 拟合并转换所有文本\n",
+ " tfidf_matrix = vectorizer.fit_transform(all_texts)\n",
+ " \n",
+ " # 计算查询与所有候选项的余弦相似度\n",
+ " query_vector = tfidf_matrix[0:1]\n",
+ " candidate_vectors = tfidf_matrix[1:]\n",
+ " \n",
+ " cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n",
+ " \n",
+ " # 找到分数最高的匹配\n",
+ " best_index = np.argmax(cosine_scores)\n",
+ " best_score = cosine_scores[best_index]\n",
+ " \n",
+ " return best_index, best_score\n",
+ " except Exception as e:\n",
+ " print(f\"TF-IDF相似度计算失败: {e}\")\n",
+ " return -1, 0.0\n",
+ " \n",
+ " # 创建结果DataFrame\n",
+ " result_data = []\n",
+ " \n",
+ " print(\"开始匹配注释...\")\n",
+ " \n",
+ " for i in range(len(test_df)):\n",
+ " if i % 100 == 0:\n",
+ " print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n",
+ " \n",
+ " query = combined_fields[i]\n",
+ " \n",
+ " # 使用TF-IDF相似度匹配,只获取最佳匹配\n",
+ " best_index, best_score = calculate_best_tfidf_match(query, regular_annotations, tfidf_vectorizer)\n",
+ " \n",
+ " # 获取测试数据的行\n",
+ " row = test_df.iloc[i]\n",
+ " \n",
+ " # 创建结果字典,包含测试数据的所有字段\n",
+ " result_dict = row.to_dict()\n",
+ " \n",
+ " # 添加最佳TF-IDF匹配结果\n",
+ " if best_index >= 0:\n",
+ " result_dict['规范节点名'] = regular_df.iloc[best_index]['节点名']\n",
+ " result_dict['规范注释'] = regular_df.iloc[best_index]['注释']\n",
+ " result_dict['规范说明'] = regular_df.iloc[best_index]['说明']\n",
+ " \n",
+ " \n",
+ " # 从最佳匹配中提取processed_string值\n",
+ " # 如果需要对DisplayString进行处理,可以在此添加逻辑\n",
+ " result_dict['processed_string'] = row['DisplayString']\n",
+ " else:\n",
+ " result_dict['规范节点名'] = ''\n",
+ " result_dict['规范注释'] = ''\n",
+ " result_dict['规范说明'] = ''\n",
+ " \n",
+ " result_dict['processed_string'] = ''\n",
+ " \n",
+ " # 添加到结果数据\n",
+ " result_data.append(result_dict)\n",
+ " \n",
+ " # 创建结果DataFrame\n",
+ " result_df = pd.DataFrame(result_data)\n",
+ " \n",
+ " # 重新排列列顺序,将匹配结果放在后面\n",
+ " all_columns = test_df.columns.tolist() + ['规范节点名', '规范注释', '规范说明', 'processed_string']\n",
+ " result_df = result_df[all_columns]\n",
+ " \n",
+ " # 保存结果\n",
+ " result_file = '/home/limeng/SICT/lung_test/result/喉癌患者测试样例_完整字段_tfidf_match.csv'\n",
+ " result_df.to_csv(result_file, index=False, encoding=test_encoding)\n",
+ " \n",
+ " print(f\"\\n匹配完成,共处理 {len(test_df)} 条记录\")\n",
+ " print(f\"结果已保存至: {result_file}\")\n",
+ " \n",
+ " # 打印结果DataFrame的列名\n",
+ " print(\"\\n结果文件的列名:\")\n",
+ " print(result_df.columns.tolist())\n",
+ " \n",
+ " # 打印前3行匹配结果示例\n",
+ " print(\"\\n前3行匹配结果示例:\")\n",
+ " for i in range(min(3, len(result_df))):\n",
+ " print(f\"原始字段: {combined_fields[i]}\")\n",
+ " print(f\"最佳匹配: {result_df.iloc[i]['规范注释']}\")\n",
+ " print(f\"处理后字符串: {result_df.iloc[i]['processed_string']}\")\n",
+ " print(\"-\" * 50)\n",
+ "else:\n",
+ " print(\"无法读取文件,请手动检查文件编码\")"
+ ]
+ },
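+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The matching loop above calls `fit_transform` on `[query] + candidates` once per record, i.e. it refits the vectorizer 127 times over the same 1142 annotations. Below is a minimal fit-once sketch; it assumes `regular_annotations`, `combined_fields`, and `tokenize_chinese` from the previous cell are still in scope. Because the IDF statistics are computed from the candidates only, the scores are not bit-identical to the per-query refit, though the ranking is usually the same."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fit-once TF-IDF matching sketch (assumes regular_annotations, combined_fields\n",
+ "# and tokenize_chinese are defined by the previous cell)\n",
+ "import numpy as np\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "\n",
+ "# token_pattern=None silences the warning about the unused default pattern\n",
+ "vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, token_pattern=None)\n",
+ "candidate_matrix = vectorizer.fit_transform(regular_annotations)  # fit once\n",
+ "query_matrix = vectorizer.transform(combined_fields)  # reuse vocabulary and IDF\n",
+ "\n",
+ "# One (n_queries, n_candidates) similarity matrix instead of 127 separate fits\n",
+ "scores = cosine_similarity(query_matrix, candidate_matrix)\n",
+ "best_indices = scores.argmax(axis=1)\n",
+ "best_scores = scores[np.arange(len(best_indices)), best_indices]\n",
+ "\n",
+ "for query, idx, score in list(zip(combined_fields, best_indices, best_scores))[:3]:\n",
+ "    print(f\"{query} -> {regular_annotations[idx]} (score={score:.3f})\")"
+ ]
+ },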
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n",
+ "2025-03-10 09:55:11,393\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "INFO 03-10 09:55:19 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.\n",
+ "WARNING 03-10 09:55:19 config.py:428] gptq quantization is not fully optimized yet. The speed can be slower than non-quantized models.\n",
+ "INFO 03-10 09:55:19 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', speculative_config=None, tokenizer='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, chat_template_text_format=string, mm_processor_kwargs=None, pooler_config=None)\n",
+ "INFO 03-10 09:55:20 selector.py:261] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.\n",
+ "INFO 03-10 09:55:20 selector.py:144] Using XFormers backend.\n",
+ "INFO 03-10 09:55:20 model_runner.py:1072] Starting to load model /opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[W310 09:55:20.487166102 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())\n",
+ "Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00, ?it/s]\n",
+ "Loading safetensors checkpoint shards: 20% Completed | 1/5 [00:00<00:03, 1.24it/s]\n",
+ "Loading safetensors checkpoint shards: 40% Completed | 2/5 [00:03<00:04, 1.64s/it]\n",
+ "Loading safetensors checkpoint shards: 60% Completed | 3/5 [00:05<00:03, 1.92s/it]\n",
+ "Loading safetensors checkpoint shards: 80% Completed | 4/5 [00:08<00:02, 2.25s/it]\n",
+ "Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:10<00:00, 2.26s/it]\n",
+ "Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:10<00:00, 2.06s/it]\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "INFO 03-10 09:55:32 model_runner.py:1077] Loading model weights took 15.5028 GB\n",
+ "INFO 03-10 09:55:42 worker.py:232] Memory profiling results: total_gpu_memory=31.73GiB initial_memory_usage=15.89GiB peak_torch_memory=19.42GiB memory_usage_post_profile=15.96GiB non_torch_memory=0.45GiB kv_cache_size=8.69GiB gpu_memory_utilization=0.90\n",
+ "INFO 03-10 09:55:43 gpu_executor.py:113] # GPU blocks: 2965, # CPU blocks: 1365\n",
+ "INFO 03-10 09:55:43 gpu_executor.py:117] Maximum concurrency for 32768 tokens per request: 1.45x\n",
+ "INFO 03-10 09:55:46 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
+ "INFO 03-10 09:55:46 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
+ "INFO 03-10 09:56:08 model_runner.py:1518] Graph capturing finished in 22 secs, took 0.59 GiB\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import json\n",
+ "# import time\n",
+ "os.environ[\"NCCL_P2P_DISABLE\"] = \"1\"\n",
+ "import torch \n",
+ "# import pynvml\n",
+ "from vllm import LLM, SamplingParams\n",
+ "import torch.distributed as dist\n",
+ "import re\n",
+ "torch.cuda.empty_cache()\n",
+ "os.environ[\"TRANSFORMERS_OFFLINE\"] = \"1\"\n",
+ "os.environ[\"HF_DATASETS_OFFLINE\"] = \"1\"\n",
+ "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n",
+ "\n",
+ "\n",
+ "\n",
+ "# 加载量化模型\n",
+ "llm = LLM(model=\"/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8\", dtype=\"half\",gpu_memory_utilization=0.9)\n",
+ "\n",
+ "# \n",
+ "sampling_params = SamplingParams(temperature=0.1, top_p=0.3, max_tokens = 256)\n",
+ "\n",
+ "# 推理示例\n"
+ ]
+ },
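+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal inference sketch for the model loaded above, using vLLM's `LLM.generate` API. The prompt is a hypothetical illustration, not part of the original pipeline; for an Instruct model, applying the tokenizer's chat template first generally gives better results."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Inference sketch: the prompt below is a hypothetical example, not data from\n",
+ "# the project; llm and sampling_params come from the previous cell\n",
+ "prompts = [\n",
+ "    \"Extract the patient's age from the following text, answering with the number only: 患者女,22岁。\"\n",
+ "]\n",
+ "\n",
+ "outputs = llm.generate(prompts, sampling_params)\n",
+ "for output in outputs:\n",
+ "    print(\"Prompt:\", output.prompt)\n",
+ "    print(\"Completion:\", output.outputs[0].text)"
+ ]
+ }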
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Qwen2.5",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--
2.22.0