{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>节点名</th>\n",
       "      <th>表名</th>\n",
       "      <th>节点类型</th>\n",
       "      <th>宽度</th>\n",
       "      <th>是否必传</th>\n",
       "      <th>注释</th>\n",
       "      <th>说明</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>COC_HBZL_RYJL$kh</td>\n",
       "      <td>入院记录</td>\n",
       "      <td>字符</td>\n",
       "      <td>32</td>\n",
       "      <td>Y</td>\n",
       "      <td>卡号</td>\n",
       "      <td>患者就诊卡卡号</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>COC_HBZL_RYJL$klx</td>\n",
       "      <td>入院记录</td>\n",
       "      <td>字符</td>\n",
       "      <td>16</td>\n",
       "      <td>Y</td>\n",
       "      <td>卡类型</td>\n",
       "      <td>参见字典表</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>COC_HBZL_RYJL$xgbz</td>\n",
       "      <td>入院记录</td>\n",
       "      <td>字符</td>\n",
       "      <td>1</td>\n",
       "      <td>Y</td>\n",
       "      <td>修改标志</td>\n",
       "      <td>1:正常 2:修改3:撤销</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>COC_HBZL_RYJL$yjlxh</td>\n",
       "      <td>入院记录</td>\n",
       "      <td>字符</td>\n",
       "      <td>32</td>\n",
       "      <td>Y</td>\n",
       "      <td>原纪录序号</td>\n",
       "      <td>院内唯一标识</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>COC_HBZL_RYJL$hzbh</td>\n",
       "      <td>入院记录</td>\n",
       "      <td>varchar</td>\n",
       "      <td>64</td>\n",
       "      <td>Y</td>\n",
       "      <td>患者编号</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1137</th>\n",
       "      <td>COC_HBZL_SFJL$hgnpg</td>\n",
       "      <td>随访记录</td>\n",
       "      <td>varchar</td>\n",
       "      <td>500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>喉功能评估</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1138</th>\n",
       "      <td>COC_HBZL_SFJL$zhzlsj</td>\n",
       "      <td>随访记录</td>\n",
       "      <td>varchar</td>\n",
       "      <td>20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>综合治疗时间</td>\n",
       "      <td>【天/月/年】后</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1139</th>\n",
       "      <td>COC_HBZL_SFJL$zhzlfa</td>\n",
       "      <td>随访记录</td>\n",
       "      <td>varchar</td>\n",
       "      <td>500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>综合治疗方案</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1140</th>\n",
       "      <td>COC_HBZL_SFJL$sxz</td>\n",
       "      <td>随访记录</td>\n",
       "      <td>varchar</td>\n",
       "      <td>50</td>\n",
       "      <td>NaN</td>\n",
       "      <td>书写者</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1141</th>\n",
       "      <td>COC_HBZL_SFJL$cjsj</td>\n",
       "      <td>随访记录</td>\n",
       "      <td>varchar</td>\n",
       "      <td>20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>创建时间</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1142 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       节点名    表名     节点类型   宽度 是否必传      注释             说明\n",
       "0         COC_HBZL_RYJL$kh  入院记录       字符   32    Y      卡号        患者就诊卡卡号\n",
       "1        COC_HBZL_RYJL$klx  入院记录       字符   16    Y     卡类型          参见字典表\n",
       "2       COC_HBZL_RYJL$xgbz  入院记录       字符    1    Y    修改标志  1:正常 2:修改3:撤销\n",
       "3      COC_HBZL_RYJL$yjlxh  入院记录       字符   32    Y   原纪录序号         院内唯一标识\n",
       "4       COC_HBZL_RYJL$hzbh  入院记录  varchar   64    Y    患者编号            NaN\n",
       "...                    ...   ...      ...  ...  ...     ...            ...\n",
       "1137   COC_HBZL_SFJL$hgnpg  随访记录  varchar  500  NaN   喉功能评估            NaN\n",
       "1138  COC_HBZL_SFJL$zhzlsj  随访记录  varchar   20  NaN  综合治疗时间       【天/月/年】后\n",
       "1139  COC_HBZL_SFJL$zhzlfa  随访记录  varchar  500  NaN  综合治疗方案            NaN\n",
       "1140     COC_HBZL_SFJL$sxz  随访记录  varchar   50  NaN     书写者            NaN\n",
       "1141    COC_HBZL_SFJL$cjsj  随访记录  varchar   20  NaN    创建时间            NaN\n",
       "\n",
       "[1142 rows x 7 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_excel('/home/limeng/SICT/lung_test/数据采集接口规范（喉癌）.xlsx')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('regular.csv',index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "匹配结果统计:\n",
      "未匹配    127\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import difflib\n",
    "\n",
    "def get_best_match(target, choices):\n",
    "    \"\"\"使用difflib找到最佳匹配\"\"\"\n",
    "    matches = difflib.get_close_matches(target, choices, n=1, cutoff=0.6)\n",
    "    return matches[0] if matches else None\n",
    "\n",
    "# 读取规范文件\n",
    "regular_df = pd.read_csv('/home/limeng/SICT/lung_test/regular.csv')\n",
    "\n",
    "# 读取测试数据\n",
    "test_df = pd.read_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例.csv', encoding='ISO-8859-1')\n",
    "\n",
    "# 创建规范字典，键为注释，值为对应的规则\n",
    "regular_dict = dict(zip(regular_df['注释'], regular_df.to_dict('records')))\n",
    "\n",
    "# 创建新的注释列\n",
    "matched_annotations = []\n",
    "for _, row in test_df.iterrows():\n",
    "    # 组合三个字段\n",
    "    combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
    "    \n",
    "    # 在规范中查找最佳匹配\n",
    "    best_match = get_best_match(combined_field, regular_dict.keys())\n",
    "    matched_annotations.append(best_match if best_match else \"未匹配\")\n",
    "\n",
    "# 获取ValueItemKind列的位置\n",
    "kind_idx = test_df.columns.get_loc('ValueItemKind')\n",
    "\n",
    "# 在ValueItemKind列前插入新的注释列\n",
    "test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n",
    "\n",
    "# 保存结果\n",
    "test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations.csv', index=False)\n",
    "\n",
    "# 打印匹配结果统计\n",
    "print(\"\\n匹配结果统计:\")\n",
    "print(pd.Series(matched_annotations).value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "测试文件编码: GB2312\n",
      "规范文件编码: utf-8\n",
      "文件成功读取！\n",
      "\n",
      "匹配结果统计:\n",
      "未匹配          123\n",
      "现病史-精神状态       1\n",
      "体格检查-精神状态      1\n",
      "体格检查-呼吸        1\n",
      "体格检查-查体        1\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import difflib\n",
    "import chardet\n",
    "\n",
    "# 首先检测文件的实际编码\n",
    "def detect_encoding(file_path):\n",
    "    with open(file_path, 'rb') as f:\n",
    "        result = chardet.detect(f.read())\n",
    "    return result['encoding']\n",
    "\n",
    "# 检测文件编码\n",
    "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n",
    "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n",
    "\n",
    "test_encoding = detect_encoding(test_file)\n",
    "regular_encoding = detect_encoding(regular_file)\n",
    "\n",
    "print(f\"测试文件编码: {test_encoding}\")\n",
    "print(f\"规范文件编码: {regular_encoding}\")\n",
    "\n",
    "# 尝试使用检测到的编码读取文件\n",
    "try:\n",
    "    # 读取规范文件\n",
    "    regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
    "    \n",
    "    # 读取测试数据\n",
    "    test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
    "    \n",
    "    print(\"文件成功读取！\")\n",
    "except Exception as e:\n",
    "    print(f\"使用检测到的编码读取失败: {e}\")\n",
    "    \n",
    "    # 尝试其他常见编码\n",
    "    encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
    "    \n",
    "    for enc in encodings:\n",
    "        try:\n",
    "            print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
    "            test_df = pd.read_csv(test_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取测试文件\")\n",
    "            \n",
    "            print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
    "            regular_df = pd.read_csv(regular_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取规范文件\")\n",
    "            \n",
    "            test_encoding = enc\n",
    "            regular_encoding = enc\n",
    "            break\n",
    "        except Exception as e:\n",
    "            print(f\"使用 {enc} 读取失败: {e}\")\n",
    "\n",
    "# 如果成功读取文件，继续处理\n",
    "if 'test_df' in locals() and 'regular_df' in locals():\n",
    "    def get_best_match(target, choices):\n",
    "        \"\"\"使用difflib找到最佳匹配\"\"\"\n",
    "        matches = difflib.get_close_matches(target, choices, n=1, cutoff=0.6)\n",
    "        return matches[0] if matches else None\n",
    "\n",
    "    # 创建规范字典，键为注释，值为对应的规则\n",
    "    regular_dict = dict(zip(regular_df['注释'], regular_df.to_dict('records')))\n",
    "\n",
    "    # 创建新的注释列\n",
    "    matched_annotations = []\n",
    "    for _, row in test_df.iterrows():\n",
    "        # 组合三个字段\n",
    "        combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
    "        \n",
    "        # 在规范中查找最佳匹配\n",
    "        best_match = get_best_match(combined_field, regular_dict.keys())\n",
    "        matched_annotations.append(best_match if best_match else \"未匹配\")\n",
    "\n",
    "    # 获取ValueItemKind列的位置\n",
    "    kind_idx = test_df.columns.get_loc('ValueItemKind')\n",
    "\n",
    "    # 在ValueItemKind列前插入新的注释列\n",
    "    test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n",
    "\n",
    "    # 保存结果\n",
    "    test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations.csv', \n",
    "                   index=False, encoding=test_encoding)\n",
    "\n",
    "    # 打印匹配结果统计\n",
    "    print(\"\\n匹配结果统计:\")\n",
    "    print(pd.Series(matched_annotations).value_counts())\n",
    "else:\n",
    "    print(\"无法读取文件，请手动检查文件编码\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "测试文件编码: GB2312\n",
      "规范文件编码: utf-8\n",
      "文件成功读取！\n",
      "加载Sentence Transformer模型...\n",
      "计算规范注释的嵌入向量...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Batches: 100%|██████████| 36/36 [00:01<00:00, 24.81it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "开始匹配注释...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Batches: 100%|██████████| 4/4 [00:00<00:00, 113.98it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "匹配结果统计:\n",
      "未匹配                    14\n",
      "手术类型                   10\n",
      "皮肤黏膜-皮下出血               9\n",
      "辅助检查-乙肝病毒-医院名称          9\n",
      "患者基本情况                  5\n",
      "主任医师签名日期                5\n",
      "手术医师签名                  5\n",
      "下一步治疗方案-具体方案            5\n",
      "发起科室参与人员-主任医师           5\n",
      "一般情况-主要症状及体征-姓名         5\n",
      "一般情况-发育                 3\n",
      "中断放疗-是否                 3\n",
      "个人史-疫源接触史-接触时间          3\n",
      "甲状腺-左侧甲状腺包块-有无          3\n",
      "第一助手                    2\n",
      "第二助手                    2\n",
      "出院时情况                   2\n",
      "以下血管结构可见肿瘤包饶-颈总动脉       2\n",
      "现病史-精神状态                2\n",
      "入院时情况-主诉                2\n",
      "病理报告-检查日期               2\n",
      "入院时情况-患者年龄              2\n",
      "讨论经过-病理科-医师姓名           2\n",
      "系统回顾-运动骨骼系统-关节肿痛-时长     2\n",
      "辅助检查-心电图-检查结论           2\n",
      "颈部-颈部气管切开-硅胶气管筒         2\n",
      "既往史-过敏史-药物食物过敏源         1\n",
      "一般情况-神志                 1\n",
      "现病史-大便                  1\n",
      "系统回顾-泌尿系统-排尿困难          1\n",
      "系统回顾-神经精神系统-癫痫          1\n",
      "脊柱四肢-关节活动               1\n",
      "咽-喉咽-喉咽后壁新生物-形态         1\n",
      "讨论经过-病理科-病理结果           1\n",
      "环后区-其他描述                1\n",
      "系统回顾-泌尿系统-排尿困难-服用药物     1\n",
      "系统回顾-血液系统-鼻衄史-目前清理      1\n",
      "记录医师签名日期                1\n",
      "一般情况-主要症状及体征-主诉         1\n",
      "喉部增强CT-CT号              1\n",
      "术前常规化验-化验日期             1\n",
      "主治医师签名                  1\n",
      "主任医师签名                  1\n",
      "月经史-月经周期                1\n",
      "术前常规化验-化验单位             1\n",
      "Name: count, dtype: int64\n",
      "\n",
      "前5行匹配结果示例:\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
      "匹配注释: 讨论经过-病理科-医师姓名\n",
      "相似度分数: 0.8398\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
      "匹配注释: 辅助检查-乙肝病毒-医院名称\n",
      "相似度分数: 0.8360\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
      "匹配注释: 入院时情况-患者年龄\n",
      "相似度分数: 0.8390\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-主诉-主诉\n",
      "匹配注释: 入院时情况-主诉\n",
      "相似度分数: 0.9434\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-入院日期-入院日期\n",
      "匹配注释: 病理报告-检查日期\n",
      "相似度分数: 0.9574\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sentence_transformers import SentenceTransformer\n",
    "import chardet\n",
    "\n",
    "# 首先检测文件的实际编码\n",
    "def detect_encoding(file_path):\n",
    "    with open(file_path, 'rb') as f:\n",
    "        result = chardet.detect(f.read())\n",
    "    return result['encoding']\n",
    "\n",
    "# 检测文件编码\n",
    "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n",
    "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n",
    "\n",
    "test_encoding = detect_encoding(test_file)\n",
    "regular_encoding = detect_encoding(regular_file)\n",
    "\n",
    "print(f\"测试文件编码: {test_encoding}\")\n",
    "print(f\"规范文件编码: {regular_encoding}\")\n",
    "\n",
    "# 尝试使用检测到的编码读取文件\n",
    "try:\n",
    "    # 读取规范文件\n",
    "    regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
    "    \n",
    "    # 读取测试数据\n",
    "    test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
    "    \n",
    "    print(\"文件成功读取！\")\n",
    "except Exception as e:\n",
    "    print(f\"使用检测到的编码读取失败: {e}\")\n",
    "    \n",
    "    # 尝试其他常见编码\n",
    "    encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
    "    \n",
    "    for enc in encodings:\n",
    "        try:\n",
    "            print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
    "            test_df = pd.read_csv(test_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取测试文件\")\n",
    "            \n",
    "            print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
    "            regular_df = pd.read_csv(regular_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取规范文件\")\n",
    "            \n",
    "            test_encoding = enc\n",
    "            regular_encoding = enc\n",
    "            break\n",
    "        except Exception as e:\n",
    "            print(f\"使用 {enc} 读取失败: {e}\")\n",
    "\n",
    "# 如果成功读取文件，继续处理\n",
    "if 'test_df' in locals() and 'regular_df' in locals():\n",
    "    # 加载预训练的中文Sentence Transformer模型\n",
    "    print(\"加载Sentence Transformer模型...\")\n",
    "    model = SentenceTransformer('/home/limeng/SICT/lung_test/all-MiniLM-L6-v2')  # 多语言模型，支持中文\n",
    "    \n",
    "    # 创建规范字典，键为注释，值为对应的规则\n",
    "    regular_annotations = regular_df['注释'].tolist()\n",
    "    \n",
    "    # 计算规范注释的嵌入向量\n",
    "    print(\"计算规范注释的嵌入向量...\")\n",
    "    regular_embeddings = model.encode(regular_annotations, show_progress_bar=True)\n",
    "    \n",
    "    # 创建新的注释列\n",
    "    matched_annotations = []\n",
    "    matched_scores = []\n",
    "    \n",
    "    print(\"开始匹配注释...\")\n",
    "    # 批量处理测试数据中的字段组合\n",
    "    combined_fields = []\n",
    "    for _, row in test_df.iterrows():\n",
    "        combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
    "        combined_fields.append(combined_field)\n",
    "    \n",
    "    # 计算测试数据的嵌入向量\n",
    "    test_embeddings = model.encode(combined_fields, show_progress_bar=True)\n",
    "    \n",
    "    # 计算相似度并找到最佳匹配\n",
    "    for i, test_embedding in enumerate(test_embeddings):\n",
    "        # 计算与所有规范注释的余弦相似度\n",
    "        similarities = np.dot(regular_embeddings, test_embedding) / (\n",
    "            np.linalg.norm(regular_embeddings, axis=1) * np.linalg.norm(test_embedding)\n",
    "        )\n",
    "        \n",
    "        # 找到最佳匹配\n",
    "        best_match_idx = np.argmax(similarities)\n",
    "        best_match_score = similarities[best_match_idx]\n",
    "        \n",
    "        # 如果相似度低于阈值，标记为未匹配\n",
    "        if best_match_score < 0.5:  # 可以调整这个阈值\n",
    "            matched_annotations.append(\"未匹配\")\n",
    "            matched_scores.append(0.0)\n",
    "        else:\n",
    "            matched_annotations.append(regular_annotations[best_match_idx])\n",
    "            matched_scores.append(best_match_score)\n",
    "    \n",
    "    # 获取ValueItemKind列的位置\n",
    "    kind_idx = test_df.columns.get_loc('ValueItemKind')\n",
    "    \n",
    "    # 在ValueItemKind列前插入新的注释列和相似度分数列\n",
    "    test_df.insert(kind_idx, 'Matched_Score', matched_scores)\n",
    "    test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n",
    "    \n",
    "    # 保存结果\n",
    "    test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations_transformer.csv', \n",
    "                   index=False, encoding=test_encoding)\n",
    "    \n",
    "    # 打印匹配结果统计\n",
    "    print(\"\\n匹配结果统计:\")\n",
    "    print(pd.Series(matched_annotations).value_counts())\n",
    "    \n",
    "    # 打印前5行匹配结果示例\n",
    "    print(\"\\n前5行匹配结果示例:\")\n",
    "    for i in range(min(5, len(test_df))):\n",
    "        print(f\"原始字段: {combined_fields[i]}\")\n",
    "        print(f\"匹配注释: {test_df.iloc[i]['Matched_Annotation']}\")\n",
    "        print(f\"相似度分数: {test_df.iloc[i]['Matched_Score']:.4f}\")\n",
    "        print(\"-\" * 50)\n",
    "else:\n",
    "    print(\"无法读取文件，请手动检查文件编码\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "测试文件编码: GB2312\n",
      "规范文件编码: utf-8\n",
      "文件成功读取！\n",
      "开始匹配注释...\n",
      "处理第 0/127 条记录...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Dumping model to file cache /tmp/jieba.cache\n",
      "Loading model cost 0.792 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "处理第 100/127 条记录...\n",
      "\n",
      "匹配结果统计:\n",
      "体格检查    20\n",
      "未匹配     20\n",
      "手术经过    13\n",
      "姓名      11\n",
      "主刀医师    10\n",
      "现病史      7\n",
      "性别       5\n",
      "手术名称     5\n",
      "小结时间     5\n",
      "麻醉方式     5\n",
      "Name: count, dtype: int64\n",
      "\n",
      "匹配方法使用统计:\n",
      "Partial        100\n",
      "None            20\n",
      "Levenshtein      5\n",
      "TF-IDF           2\n",
      "Name: count, dtype: int64\n",
      "\n",
      "前5行匹配结果示例:\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
      "匹配注释: 姓名\n",
      "相似度分数: 1.0000\n",
      "匹配方法: Partial\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
      "匹配注释: 姓名\n",
      "相似度分数: 1.0000\n",
      "匹配方法: Partial\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
      "匹配注释: 姓名\n",
      "相似度分数: 1.0000\n",
      "匹配方法: Partial\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-主诉-主诉\n",
      "匹配注释: 主诉\n",
      "相似度分数: 1.0000\n",
      "匹配方法: Partial\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-入院日期-入院日期\n",
      "匹配注释: 病理报告-检查日期\n",
      "相似度分数: 0.6774\n",
      "匹配方法: Levenshtein\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import chardet\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import jieba\n",
    "import Levenshtein\n",
    "from fuzzywuzzy import fuzz\n",
    "\n",
    "# 首先检测文件的实际编码\n",
    "def detect_encoding(file_path):\n",
    "    with open(file_path, 'rb') as f:\n",
    "        result = chardet.detect(f.read())\n",
    "    return result['encoding']\n",
    "\n",
    "# 检测文件编码\n",
    "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n",
    "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n",
    "\n",
    "test_encoding = detect_encoding(test_file)\n",
    "regular_encoding = detect_encoding(regular_file)\n",
    "\n",
    "print(f\"测试文件编码: {test_encoding}\")\n",
    "print(f\"规范文件编码: {regular_encoding}\")\n",
    "\n",
    "# 尝试使用检测到的编码读取文件\n",
    "try:\n",
    "    # 读取规范文件\n",
    "    regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
    "    \n",
    "    # 读取测试数据\n",
    "    test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
    "    \n",
    "    print(\"文件成功读取！\")\n",
    "except Exception as e:\n",
    "    print(f\"使用检测到的编码读取失败: {e}\")\n",
    "    \n",
    "    # 尝试其他常见编码\n",
    "    encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
    "    \n",
    "    for enc in encodings:\n",
    "        try:\n",
    "            print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
    "            test_df = pd.read_csv(test_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取测试文件\")\n",
    "            \n",
    "            print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
    "            regular_df = pd.read_csv(regular_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取规范文件\")\n",
    "            \n",
    "            test_encoding = enc\n",
    "            regular_encoding = enc\n",
    "            break\n",
    "        except Exception as e:\n",
    "            print(f\"使用 {enc} 读取失败: {e}\")\n",
    "\n",
    "# 如果成功读取文件，继续处理\n",
    "if 'test_df' in locals() and 'regular_df' in locals():\n",
    "    # 创建规范字典，键为注释，值为对应的规则\n",
    "    regular_annotations = regular_df['注释'].tolist()\n",
    "    \n",
    "    # 准备测试数据中的字段组合\n",
    "    combined_fields = []\n",
    "    for _, row in test_df.iterrows():\n",
    "        combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
    "        combined_fields.append(combined_field)\n",
    "    \n",
    "    # 定义多种相似度计算方法\n",
    "    def calculate_similarities(query, candidates):\n",
    "        \"\"\"计算多种相似度指标\"\"\"\n",
    "        results = []\n",
    "        \n",
    "        # 1. TF-IDF + 余弦相似度\n",
    "        try:\n",
    "            # 对中文文本进行分词\n",
    "            segmented_query = ' '.join(jieba.cut(query))\n",
    "            segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n",
    "            \n",
    "            # 计算TF-IDF向量\n",
    "            vectorizer = TfidfVectorizer()\n",
    "            tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n",
    "            \n",
    "            # 计算余弦相似度\n",
    "            cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n",
    "            \n",
    "            # 找到最佳匹配\n",
    "            best_idx_tfidf = np.argmax(cosine_sim)\n",
    "            best_score_tfidf = cosine_sim[best_idx_tfidf]\n",
    "            results.append((candidates[best_idx_tfidf], best_score_tfidf, \"TF-IDF\"))\n",
    "        except Exception as e:\n",
    "            print(f\"TF-IDF计算失败: {e}\")\n",
    "        \n",
    "        # 2. Levenshtein距离（编辑距离）\n",
    "        try:\n",
    "            lev_distances = [Levenshtein.distance(query, c) for c in candidates]\n",
    "            # 将距离转换为相似度分数（越小越相似）\n",
    "            max_len = max(len(query), max(len(c) for c in candidates))\n",
    "            lev_similarities = [1 - dist/max_len for dist in lev_distances]\n",
    "            \n",
    "            best_idx_lev = np.argmax(lev_similarities)\n",
    "            best_score_lev = lev_similarities[best_idx_lev]\n",
    "            results.append((candidates[best_idx_lev], best_score_lev, \"Levenshtein\"))\n",
    "        except Exception as e:\n",
    "            print(f\"Levenshtein计算失败: {e}\")\n",
    "        \n",
    "        # 3. FuzzyWuzzy比率\n",
    "        try:\n",
    "            fuzzy_ratios = [fuzz.ratio(query, c)/100 for c in candidates]\n",
    "            best_idx_fuzzy = np.argmax(fuzzy_ratios)\n",
    "            best_score_fuzzy = fuzzy_ratios[best_idx_fuzzy]\n",
    "            results.append((candidates[best_idx_fuzzy], best_score_fuzzy, \"FuzzyWuzzy\"))\n",
    "        except Exception as e:\n",
    "            print(f\"FuzzyWuzzy计算失败: {e}\")\n",
    "        \n",
    "        # 4. FuzzyWuzzy部分比率（处理子字符串）\n",
    "        try:\n",
    "            partial_ratios = [fuzz.partial_ratio(query, c)/100 for c in candidates]\n",
    "            best_idx_partial = np.argmax(partial_ratios)\n",
    "            best_score_partial = partial_ratios[best_idx_partial]\n",
    "            results.append((candidates[best_idx_partial], best_score_partial, \"Partial\"))\n",
    "        except Exception as e:\n",
    "            print(f\"Partial比率计算失败: {e}\")\n",
    "        \n",
    "        # 5. FuzzyWuzzy令牌排序比率（处理词序不同）\n",
    "        try:\n",
    "            token_sort_ratios = [fuzz.token_sort_ratio(query, c)/100 for c in candidates]\n",
    "            best_idx_token = np.argmax(token_sort_ratios)\n",
    "            best_score_token = token_sort_ratios[best_idx_token]\n",
    "            results.append((candidates[best_idx_token], best_score_token, \"TokenSort\"))\n",
    "        except Exception as e:\n",
    "            print(f\"TokenSort比率计算失败: {e}\")\n",
    "        \n",
    "        # 找出所有方法中得分最高的结果\n",
    "        best_result = max(results, key=lambda x: x[1]) if results else (None, 0, None)\n",
    "        \n",
    "        return best_result\n",
    "    \n",
    "    # 对每个测试字段进行匹配\n",
    "    matched_annotations = []\n",
    "    matched_scores = []\n",
    "    matched_methods = []\n",
    "    \n",
    "    print(\"开始匹配注释...\")\n",
    "    for i, query in enumerate(combined_fields):\n",
    "        if i % 100 == 0:\n",
    "            print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n",
    "        \n",
    "        # 计算多种相似度并选择最佳匹配\n",
    "        best_match, best_score, best_method = calculate_similarities(query, regular_annotations)\n",
    "        \n",
    "        # 如果相似度低于阈值，标记为未匹配\n",
    "        if best_score < 0.6:  # 可以调整这个阈值\n",
    "            matched_annotations.append(\"未匹配\")\n",
    "            matched_scores.append(0.0)\n",
    "            matched_methods.append(\"None\")\n",
    "        else:\n",
    "            matched_annotations.append(best_match)\n",
    "            matched_scores.append(best_score)\n",
    "            matched_methods.append(best_method)\n",
    "    \n",
    "    # 获取ValueItemKind列的位置\n",
    "    kind_idx = test_df.columns.get_loc('ValueItemKind')\n",
    "    \n",
    "    # 在ValueItemKind列前插入新的列\n",
    "    test_df.insert(kind_idx, 'Matched_Method', matched_methods)\n",
    "    test_df.insert(kind_idx, 'Matched_Score', matched_scores)\n",
    "    test_df.insert(kind_idx, 'Matched_Annotation', matched_annotations)\n",
    "    \n",
    "    # 保存结果\n",
    "    test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations_multi.csv', \n",
    "                   index=False, encoding=test_encoding)\n",
    "    \n",
    "    # 打印匹配结果统计\n",
    "    print(\"\\n匹配结果统计:\")\n",
    "    print(pd.Series(matched_annotations).value_counts().head(10))\n",
    "    \n",
    "    # 打印方法使用统计\n",
    "    print(\"\\n匹配方法使用统计:\")\n",
    "    print(pd.Series(matched_methods).value_counts())\n",
    "    \n",
    "    # 打印前5行匹配结果示例\n",
    "    print(\"\\n前5行匹配结果示例:\")\n",
    "    for i in range(min(5, len(test_df))):\n",
    "        print(f\"原始字段: {combined_fields[i]}\")\n",
    "        print(f\"匹配注释: {test_df.iloc[i]['Matched_Annotation']}\")\n",
    "        print(f\"相似度分数: {test_df.iloc[i]['Matched_Score']:.4f}\")\n",
    "        print(f\"匹配方法: {test_df.iloc[i]['Matched_Method']}\")\n",
    "        print(\"-\" * 50)\n",
    "else:\n",
    "    print(\"无法读取文件，请手动检查文件编码\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "测试文件编码: GB2312\n",
      "规范文件编码: utf-8\n",
      "文件成功读取！\n",
      "开始匹配注释...\n",
      "处理第 0/127 条记录...\n",
      "处理第 100/127 条记录...\n",
      "\n",
      "各方法匹配结果统计:\n",
      "TF-IDF匹配结果:\n",
      "主刀医师                  15\n",
      "卡号                    14\n",
      "手术经过                  13\n",
      "既往史-手术外伤史-手术史-手术时间    10\n",
      "患者姓名                   7\n",
      "患者基本情况                 6\n",
      "手术名称                   5\n",
      "麻醉方式                   5\n",
      "参与人员                   4\n",
      "主诉                     3\n",
      "Name: count, dtype: int64\n",
      "\n",
      "Levenshtein匹配结果:\n",
      "既往史-手术外伤史-手术史-有无      13\n",
      "体格检查-血压-收缩压           10\n",
      "既往史-手术外伤史-手术史-手术时间    10\n",
      "入院时情况-患者姓名             8\n",
      "主要辅助检查-实验室检查-Na        8\n",
      "主治医师签名                 6\n",
      "既往史-手术外伤史-手术史-手术名称     5\n",
      "患者性别                   5\n",
      "主治医师签名时间               5\n",
      "个人史-饮酒史-主要饮酒种类         5\n",
      "Name: count, dtype: int64\n",
      "\n",
      "FuzzyWuzzy匹配结果:\n",
      "讨论经过-耳鼻喉科/眼科-具体手术方案    13\n",
      "患者姓名                   11\n",
      "既往史-手术外伤史-手术史-手术时间     10\n",
      "体格检查-血压-收缩压             9\n",
      "主要辅助检查-实验室检查-Na         8\n",
      "主治医师签名                  6\n",
      "发起科室参与人员-主任医师           5\n",
      "麻醉方式                    5\n",
      "患者性别                    5\n",
      "既往史-手术外伤史-手术史-手术名称      5\n",
      "Name: count, dtype: int64\n",
      "\n",
      "前5行匹配结果示例:\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
      "TF-IDF匹配: 患者姓名\n",
      "Levenshtein匹配: 患者姓名\n",
      "FuzzyWuzzy匹配: 患者姓名\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
      "TF-IDF匹配: 患者性别\n",
      "Levenshtein匹配: 患者姓名\n",
      "FuzzyWuzzy匹配: 患者姓名\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
      "TF-IDF匹配: 患者年龄\n",
      "Levenshtein匹配: 患者姓名\n",
      "FuzzyWuzzy匹配: 患者姓名\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-主诉-主诉\n",
      "TF-IDF匹配: 主诉\n",
      "Levenshtein匹配: 入院时情况-主诉\n",
      "FuzzyWuzzy匹配: 一般情况-主要症状及体征-主诉\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-入院日期-入院日期\n",
      "TF-IDF匹配: 入院时情况-入院时间\n",
      "Levenshtein匹配: 病理报告-检查日期\n",
      "FuzzyWuzzy匹配: 现病史-外院手术日期\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import chardet\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import jieba\n",
    "import Levenshtein\n",
    "from fuzzywuzzy import fuzz\n",
    "\n",
    "# 首先检测文件的实际编码\n",
    "def detect_encoding(file_path):\n",
    "    with open(file_path, 'rb') as f:\n",
    "        result = chardet.detect(f.read())\n",
    "    return result['encoding']\n",
    "\n",
    "# 检测文件编码\n",
    "test_file = '/home/limeng/SICT/lung_test/喉癌患者测试样例.csv'\n",
    "regular_file = '/home/limeng/SICT/lung_test/regular.csv'\n",
    "\n",
    "test_encoding = detect_encoding(test_file)\n",
    "regular_encoding = detect_encoding(regular_file)\n",
    "\n",
    "print(f\"测试文件编码: {test_encoding}\")\n",
    "print(f\"规范文件编码: {regular_encoding}\")\n",
    "\n",
    "# 尝试使用检测到的编码读取文件\n",
    "try:\n",
    "    # 读取规范文件\n",
    "    regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
    "    \n",
    "    # 读取测试数据\n",
    "    test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
    "    \n",
    "    print(\"文件成功读取！\")\n",
    "except Exception as e:\n",
    "    print(f\"使用检测到的编码读取失败: {e}\")\n",
    "    \n",
    "    # 尝试其他常见编码\n",
    "    encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
    "    \n",
    "    for enc in encodings:\n",
    "        try:\n",
    "            print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
    "            test_df = pd.read_csv(test_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取测试文件\")\n",
    "            \n",
    "            print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
    "            regular_df = pd.read_csv(regular_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取规范文件\")\n",
    "            \n",
    "            test_encoding = enc\n",
    "            regular_encoding = enc\n",
    "            break\n",
    "        except Exception as e:\n",
    "            print(f\"使用 {enc} 读取失败: {e}\")\n",
    "\n",
    "# 如果成功读取文件，继续处理\n",
    "if 'test_df' in locals() and 'regular_df' in locals():\n",
    "    # 创建规范字典，键为注释，值为对应的规则\n",
    "    regular_annotations = regular_df['注释'].tolist()\n",
    "    \n",
    "    # 准备测试数据中的字段组合\n",
    "    combined_fields = []\n",
    "    for _, row in test_df.iterrows():\n",
    "        combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
    "        combined_fields.append(combined_field)\n",
    "    \n",
    "    # 定义多种相似度计算方法\n",
    "    def calculate_similarities(query, candidates):\n",
    "        \"\"\"计算多种相似度指标，返回每种方法的最佳匹配\"\"\"\n",
    "        results = {}\n",
    "        \n",
    "        # 1. TF-IDF + 余弦相似度\n",
    "        try:\n",
    "            # 对中文文本进行分词\n",
    "            segmented_query = ' '.join(jieba.cut(query))\n",
    "            segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n",
    "            \n",
    "            # 计算TF-IDF向量\n",
    "            vectorizer = TfidfVectorizer()\n",
    "            tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n",
    "            \n",
    "            # 计算余弦相似度\n",
    "            cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n",
    "            \n",
    "            # 找到最佳匹配\n",
    "            best_idx_tfidf = np.argmax(cosine_sim)\n",
    "            results['TF-IDF'] = candidates[best_idx_tfidf]\n",
    "        except Exception as e:\n",
    "            print(f\"TF-IDF计算失败: {e}\")\n",
    "            results['TF-IDF'] = \"未匹配\"\n",
    "        \n",
    "        # 2. Levenshtein距离（编辑距离）\n",
    "        try:\n",
    "            lev_distances = [Levenshtein.distance(query, c) for c in candidates]\n",
    "            # 将距离转换为相似度分数（越小越相似）\n",
    "            best_idx_lev = np.argmin(lev_distances)\n",
    "            results['Levenshtein'] = candidates[best_idx_lev]\n",
    "        except Exception as e:\n",
    "            print(f\"Levenshtein计算失败: {e}\")\n",
    "            results['Levenshtein'] = \"未匹配\"\n",
    "        \n",
    "        # 3. FuzzyWuzzy比率\n",
    "        try:\n",
    "            fuzzy_ratios = [fuzz.ratio(query, c) for c in candidates]\n",
    "            best_idx_fuzzy = np.argmax(fuzzy_ratios)\n",
    "            results['FuzzyWuzzy'] = candidates[best_idx_fuzzy]\n",
    "        except Exception as e:\n",
    "            print(f\"FuzzyWuzzy计算失败: {e}\")\n",
    "            results['FuzzyWuzzy'] = \"未匹配\"\n",
    "        \n",
    "        # 4. FuzzyWuzzy部分比率（处理子字符串）\n",
    "        try:\n",
    "            partial_ratios = [fuzz.partial_ratio(query, c) for c in candidates]\n",
    "            best_idx_partial = np.argmax(partial_ratios)\n",
    "            results['Partial'] = candidates[best_idx_partial]\n",
    "        except Exception as e:\n",
    "            print(f\"Partial比率计算失败: {e}\")\n",
    "            results['Partial'] = \"未匹配\"\n",
    "        \n",
    "        # 5. FuzzyWuzzy令牌排序比率（处理词序不同）\n",
    "        try:\n",
    "            token_sort_ratios = [fuzz.token_sort_ratio(query, c) for c in candidates]\n",
    "            best_idx_token = np.argmax(token_sort_ratios)\n",
    "            results['TokenSort'] = candidates[best_idx_token]\n",
    "        except Exception as e:\n",
    "            print(f\"TokenSort比率计算失败: {e}\")\n",
    "            results['TokenSort'] = \"未匹配\"\n",
    "        \n",
    "        return results\n",
    "    \n",
    "    # 对每个测试字段进行匹配\n",
    "    tfidf_matches = []\n",
    "    levenshtein_matches = []\n",
    "    fuzzywuzzy_matches = []\n",
    "    \n",
    "    print(\"开始匹配注释...\")\n",
    "    for i, query in enumerate(combined_fields):\n",
    "        if i % 100 == 0:\n",
    "            print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n",
    "        \n",
    "        # 计算多种相似度\n",
    "        matches = calculate_similarities(query, regular_annotations)\n",
    "        \n",
    "        # 保存各种方法的匹配结果\n",
    "        tfidf_matches.append(matches.get('TF-IDF', \"未匹配\"))\n",
    "        levenshtein_matches.append(matches.get('Levenshtein', \"未匹配\"))\n",
    "        fuzzywuzzy_matches.append(matches.get('FuzzyWuzzy', \"未匹配\"))\n",
    "    \n",
    "    # 获取ValueItemKind列的位置\n",
    "    kind_idx = test_df.columns.get_loc('ValueItemKind')\n",
    "    \n",
    "    # 在ValueItemKind列前插入新的列\n",
    "    test_df.insert(kind_idx, 'FuzzyWuzzy_Match', fuzzywuzzy_matches)\n",
    "    test_df.insert(kind_idx, 'Levenshtein_Match', levenshtein_matches)\n",
    "    test_df.insert(kind_idx, 'TFIDF_Match', tfidf_matches)\n",
    "    \n",
    "    # 保存结果\n",
    "    test_df.to_csv('/home/limeng/SICT/lung_test/喉癌患者测试样例_with_annotations_all.csv', \n",
    "                   index=False, encoding=test_encoding)\n",
    "    \n",
    "    # 打印匹配结果统计\n",
    "    print(\"\\n各方法匹配结果统计:\")\n",
    "    print(\"TF-IDF匹配结果:\")\n",
    "    print(pd.Series(tfidf_matches).value_counts().head(10))\n",
    "    print(\"\\nLevenshtein匹配结果:\")\n",
    "    print(pd.Series(levenshtein_matches).value_counts().head(10))\n",
    "    print(\"\\nFuzzyWuzzy匹配结果:\")\n",
    "    print(pd.Series(fuzzywuzzy_matches).value_counts().head(10))\n",
    "    \n",
    "    # 打印前5行匹配结果示例\n",
    "    print(\"\\n前5行匹配结果示例:\")\n",
    "    for i in range(min(5, len(test_df))):\n",
    "        print(f\"原始字段: {combined_fields[i]}\")\n",
    "        print(f\"TF-IDF匹配: {test_df.iloc[i]['TFIDF_Match']}\")\n",
    "        print(f\"Levenshtein匹配: {test_df.iloc[i]['Levenshtein_Match']}\")\n",
    "        print(f\"FuzzyWuzzy匹配: {test_df.iloc[i]['FuzzyWuzzy_Match']}\")\n",
    "        print(\"-\" * 50)\n",
    "else:\n",
    "    print(\"无法读取文件，请手动检查文件编码\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[1], line 20\u001b[0m\n\u001b[1;32m     17\u001b[0m test_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m     18\u001b[0m regular_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/home/limeng/SICT/lung_test/data/regular.csv\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 20\u001b[0m test_encoding \u001b[38;5;241m=\u001b[39m \u001b[43mdetect_encoding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtest_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     21\u001b[0m regular_encoding \u001b[38;5;241m=\u001b[39m detect_encoding(regular_file)\n\u001b[1;32m     23\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m测试文件编码: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtest_encoding\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
      "Cell \u001b[0;32mIn[1], line 12\u001b[0m, in \u001b[0;36mdetect_encoding\u001b[0;34m(file_path)\u001b[0m\n\u001b[1;32m     11\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdetect_encoding\u001b[39m(file_path):\n\u001b[0;32m---> 12\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m     13\u001b[0m         result \u001b[38;5;241m=\u001b[39m chardet\u001b[38;5;241m.\u001b[39mdetect(f\u001b[38;5;241m.\u001b[39mread())\n\u001b[1;32m     14\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m result[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mencoding\u001b[39m\u001b[38;5;124m'\u001b[39m]\n",
      "File \u001b[0;32m~/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/IPython/core/interactiveshell.py:324\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m    317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m    318\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m    319\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    320\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    321\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    322\u001b[0m     )\n\u001b[0;32m--> 324\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import chardet\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import jieba\n",
    "import Levenshtein\n",
    "from fuzzywuzzy import fuzz\n",
    "\n",
    "# 首先检测文件的实际编码\n",
    "def detect_encoding(file_path):\n",
    "    with open(file_path, 'rb') as f:\n",
    "        result = chardet.detect(f.read())\n",
    "    return result['encoding']\n",
    "\n",
    "# 检测文件编码\n",
    "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
    "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
    "\n",
    "test_encoding = detect_encoding(test_file)\n",
    "regular_encoding = detect_encoding(regular_file)\n",
    "\n",
    "print(f\"测试文件编码: {test_encoding}\")\n",
    "print(f\"规范文件编码: {regular_encoding}\")\n",
    "\n",
    "# 尝试使用检测到的编码读取文件\n",
    "try:\n",
    "    # 读取规范文件\n",
    "    regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
    "    \n",
    "    # 读取测试数据\n",
    "    test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
    "    \n",
    "    print(\"文件成功读取！\")\n",
    "except Exception as e:\n",
    "    print(f\"使用检测到的编码读取失败: {e}\")\n",
    "    \n",
    "    # 尝试其他常见编码\n",
    "    encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
    "    \n",
    "    for enc in encodings:\n",
    "        try:\n",
    "            print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
    "            test_df = pd.read_csv(test_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取测试文件\")\n",
    "            \n",
    "            print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
    "            regular_df = pd.read_csv(regular_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取规范文件\")\n",
    "            \n",
    "            test_encoding = enc\n",
    "            regular_encoding = enc\n",
    "            break\n",
    "        except Exception as e:\n",
    "            print(f\"使用 {enc} 读取失败: {e}\")\n",
    "\n",
    "# 如果成功读取文件，继续处理\n",
    "if 'test_df' in locals() and 'regular_df' in locals():\n",
    "    # 创建规范字典，键为注释，值为对应的规则\n",
    "    regular_annotations = regular_df['注释'].tolist()\n",
    "    \n",
    "    # 准备测试数据中的字段组合\n",
    "    combined_fields = []\n",
    "    paragraph_names = []\n",
    "    statement_names = []\n",
    "    value_item_names = []\n",
    "    \n",
    "    for _, row in test_df.iterrows():\n",
    "        combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
    "        combined_fields.append(combined_field)\n",
    "        paragraph_names.append(row['ParagraphName'])\n",
    "        statement_names.append(row['StatementName'])\n",
    "        value_item_names.append(row['ValueItemName'])\n",
    "    \n",
    "    # 定义多种相似度计算方法\n",
    "    def calculate_similarities(query, candidates):\n",
    "        \"\"\"计算多种相似度指标，返回每种方法的最佳匹配和分数\"\"\"\n",
    "        results = {}\n",
    "        scores = {}\n",
    "        \n",
    "        # 1. TF-IDF + 余弦相似度\n",
    "        try:\n",
    "            # 对中文文本进行分词\n",
    "            segmented_query = ' '.join(jieba.cut(query))\n",
    "            segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n",
    "            \n",
    "            # 计算TF-IDF向量\n",
    "            vectorizer = TfidfVectorizer()\n",
    "            tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n",
    "            \n",
    "            # 计算余弦相似度\n",
    "            cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n",
    "            \n",
    "            # 找到最佳匹配\n",
    "            best_idx_tfidf = np.argmax(cosine_sim)\n",
    "            results['TF-IDF'] = candidates[best_idx_tfidf]\n",
    "            scores['TF-IDF'] = cosine_sim[best_idx_tfidf]\n",
    "        except Exception as e:\n",
    "            print(f\"TF-IDF计算失败: {e}\")\n",
    "            results['TF-IDF'] = \"未匹配\"\n",
    "            scores['TF-IDF'] = 0.0\n",
    "        \n",
    "        # 2. Levenshtein距离（编辑距离）\n",
    "        try:\n",
    "            lev_distances = [Levenshtein.distance(query, c) for c in candidates]\n",
    "            # 将距离转换为相似度分数（越小越相似）\n",
    "            max_len = max(len(query), max(len(c) for c in candidates))\n",
    "            lev_similarities = [1 - dist/max_len for dist in lev_distances]\n",
    "            \n",
    "            best_idx_lev = np.argmax(lev_similarities)\n",
    "            results['Levenshtein'] = candidates[best_idx_lev]\n",
    "            scores['Levenshtein'] = lev_similarities[best_idx_lev]\n",
    "        except Exception as e:\n",
    "            print(f\"Levenshtein计算失败: {e}\")\n",
    "            results['Levenshtein'] = \"未匹配\"\n",
    "            scores['Levenshtein'] = 0.0\n",
    "        \n",
    "        # 3. FuzzyWuzzy比率\n",
    "        try:\n",
    "            fuzzy_ratios = [fuzz.ratio(query, c)/100 for c in candidates]\n",
    "            best_idx_fuzzy = np.argmax(fuzzy_ratios)\n",
    "            results['FuzzyWuzzy'] = candidates[best_idx_fuzzy]\n",
    "            scores['FuzzyWuzzy'] = fuzzy_ratios[best_idx_fuzzy]\n",
    "        except Exception as e:\n",
    "            print(f\"FuzzyWuzzy计算失败: {e}\")\n",
    "            results['FuzzyWuzzy'] = \"未匹配\"\n",
    "            scores['FuzzyWuzzy'] = 0.0\n",
    "        \n",
    "        return results, scores\n",
    "    \n",
    "    # 对每个测试字段进行匹配\n",
    "    tfidf_matches = []\n",
    "    levenshtein_matches = []\n",
    "    fuzzywuzzy_matches = []\n",
    "    best_matches = []\n",
    "    best_match_methods = []\n",
    "    \n",
    "    print(\"开始匹配注释...\")\n",
    "    for i in range(len(combined_fields)):\n",
    "        if i % 100 == 0:\n",
    "            print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n",
    "        \n",
    "        query = combined_fields[i]\n",
    "        paragraph_name = paragraph_names[i]\n",
    "        value_item_name = value_item_names[i]\n",
    "        \n",
    "        # 1. 首先检查是否有注释包含ParagraphName\n",
    "        paragraph_matches = [ann for ann in regular_annotations if paragraph_name in ann]\n",
    "        \n",
    "        if paragraph_matches:\n",
    "            # 2. 如果有包含ParagraphName的注释，再检查是否有同时包含ValueItemName的\n",
    "            value_matches = [ann for ann in paragraph_matches if value_item_name in ann]\n",
    "            if value_matches:\n",
    "                # 找到同时包含ParagraphName和ValueItemName的注释\n",
    "                best_match = value_matches[0]  # 取第一个匹配\n",
    "                best_match_method = \"精确匹配(段落+值)\"\n",
    "            else:\n",
    "                # 只找到包含ParagraphName的注释\n",
    "                best_match = paragraph_matches[0]  # 取第一个匹配\n",
    "                best_match_method = \"段落匹配\"\n",
    "        else:\n",
    "            # 3. 如果没有包含ParagraphName的注释，直接使用相似度指标\n",
    "            matches, scores = calculate_similarities(query, regular_annotations)\n",
    "            \n",
    "            # 选择得分最高的方法\n",
    "            best_method = max(scores.items(), key=lambda x: x[1])[0]\n",
    "            best_match = matches[best_method]\n",
    "            best_match_method = f\"相似度({best_method})\"\n",
    "        \n",
    "        # 计算相似度匹配以便比较\n",
    "        matches, _ = calculate_similarities(query, regular_annotations)\n",
    "        tfidf_matches.append(matches.get('TF-IDF', \"未匹配\"))\n",
    "        levenshtein_matches.append(matches.get('Levenshtein', \"未匹配\"))\n",
    "        fuzzywuzzy_matches.append(matches.get('FuzzyWuzzy', \"未匹配\"))\n",
    "        \n",
    "        best_matches.append(best_match)\n",
    "        best_match_methods.append(best_match_method)\n",
    "    \n",
    "    # 获取ValueItemKind列的位置\n",
    "    kind_idx = test_df.columns.get_loc('ValueItemKind')\n",
    "    \n",
    "    # 在ValueItemKind列前插入新的列\n",
    "    test_df.insert(kind_idx, 'Best_Match_Method', best_match_methods)\n",
    "    test_df.insert(kind_idx, 'Best_Match', best_matches)\n",
    "    test_df.insert(kind_idx, 'FuzzyWuzzy_Match', fuzzywuzzy_matches)\n",
    "    test_df.insert(kind_idx, 'Levenshtein_Match', levenshtein_matches)\n",
    "    test_df.insert(kind_idx, 'TFIDF_Match', tfidf_matches)\n",
    "    \n",
    "    # 保存结果\n",
    "    test_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_with_annotations_all3.csv', \n",
    "                   index=False, encoding=test_encoding)\n",
    "    \n",
    "    # 打印匹配结果统计\n",
    "    print(\"\\n最佳匹配方法统计:\")\n",
    "    print(pd.Series(best_match_methods).value_counts())\n",
    "    \n",
    "    print(\"\\n各方法匹配结果统计:\")\n",
    "    print(\"最佳匹配结果:\")\n",
    "    print(pd.Series(best_matches).value_counts().head(10))\n",
    "    print(\"\\nTF-IDF匹配结果:\")\n",
    "    print(pd.Series(tfidf_matches).value_counts().head(10))\n",
    "    print(\"\\nLevenshtein匹配结果:\")\n",
    "    print(pd.Series(levenshtein_matches).value_counts().head(10))\n",
    "    print(\"\\nFuzzyWuzzy匹配结果:\")\n",
    "    print(pd.Series(fuzzywuzzy_matches).value_counts().head(10))\n",
    "    \n",
    "    # 打印前5行匹配结果示例\n",
    "    print(\"\\n前5行匹配结果示例:\")\n",
    "    for i in range(min(5, len(test_df))):\n",
    "        print(f\"原始字段: {combined_fields[i]}\")\n",
    "        print(f\"最佳匹配: {test_df.iloc[i]['Best_Match']} (方法: {test_df.iloc[i]['Best_Match_Method']})\")\n",
    "        print(f\"TF-IDF匹配: {test_df.iloc[i]['TFIDF_Match']}\")\n",
    "        print(f\"Levenshtein匹配: {test_df.iloc[i]['Levenshtein_Match']}\")\n",
    "        print(f\"FuzzyWuzzy匹配: {test_df.iloc[i]['FuzzyWuzzy_Match']}\")\n",
    "        print(\"-\" * 50)\n",
    "else:\n",
    "    print(\"无法读取文件，请手动检查文件编码\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /tmp/jieba.cache\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "测试文件编码: GB2312\n",
      "规范文件编码: utf-8\n",
      "文件成功读取！\n",
      "开始匹配注释...\n",
      "处理第 0/127 条记录...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading model cost 1.175 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "处理第 100/127 条记录...\n",
      "\n",
      "最佳匹配方法统计:\n",
      "TF-IDF相似度     89\n",
      "段落匹配          32\n",
      "精确匹配(段落+值)     6\n",
      "Name: count, dtype: int64\n",
      "\n",
      "匹配结果统计:\n",
      "最佳匹配结果:\n",
      "体格检查-体温               20\n",
      "主刀医师                  15\n",
      "卡号                    14\n",
      "手术经过                  13\n",
      "既往史-手术外伤史-手术史-手术时间    10\n",
      "患者姓名                   7\n",
      "患者基本情况                 6\n",
      "现病史-发病日期               6\n",
      "麻醉方式                   5\n",
      "手术名称                   5\n",
      "Name: count, dtype: int64\n",
      "\n",
      "TF-IDF匹配结果:\n",
      "主刀医师                  15\n",
      "卡号                    14\n",
      "手术经过                  13\n",
      "既往史-手术外伤史-手术史-手术时间    10\n",
      "患者姓名                   7\n",
      "患者基本情况                 6\n",
      "手术名称                   5\n",
      "麻醉方式                   5\n",
      "参与人员                   4\n",
      "主诉                     3\n",
      "Name: count, dtype: int64\n",
      "\n",
      "前5行匹配结果示例:\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
      "最佳匹配: 患者姓名 (方法: TF-IDF相似度)\n",
      "相似度分数: 0.5819\n",
      "TF-IDF匹配: 患者姓名\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
      "最佳匹配: 患者性别 (方法: TF-IDF相似度)\n",
      "相似度分数: 0.6637\n",
      "TF-IDF匹配: 患者性别\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
      "最佳匹配: 患者年龄 (方法: TF-IDF相似度)\n",
      "相似度分数: 0.6576\n",
      "TF-IDF匹配: 患者年龄\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-主诉-主诉\n",
      "最佳匹配: 主诉 (方法: TF-IDF相似度)\n",
      "相似度分数: 0.7628\n",
      "TF-IDF匹配: 主诉\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-入院日期-入院日期\n",
      "最佳匹配: 入院时情况-入院时间 (方法: TF-IDF相似度)\n",
      "相似度分数: 0.5297\n",
      "TF-IDF匹配: 入院时情况-入院时间\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import chardet\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import jieba\n",
    "\n",
    "# 首先检测文件的实际编码\n",
    "def detect_encoding(file_path):\n",
    "    with open(file_path, 'rb') as f:\n",
    "        result = chardet.detect(f.read())\n",
    "    return result['encoding']\n",
    "\n",
    "# 检测文件编码\n",
    "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
    "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
    "\n",
    "test_encoding = detect_encoding(test_file)\n",
    "regular_encoding = detect_encoding(regular_file)\n",
    "\n",
    "print(f\"测试文件编码: {test_encoding}\")\n",
    "print(f\"规范文件编码: {regular_encoding}\")\n",
    "\n",
    "# 尝试使用检测到的编码读取文件\n",
    "try:\n",
    "    # 读取规范文件\n",
    "    regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
    "    \n",
    "    # 读取测试数据\n",
    "    test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
    "    \n",
    "    print(\"文件成功读取！\")\n",
    "except Exception as e:\n",
    "    print(f\"使用检测到的编码读取失败: {e}\")\n",
    "    \n",
    "    # 尝试其他常见编码\n",
    "    encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
    "    \n",
    "    for enc in encodings:\n",
    "        try:\n",
    "            print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
    "            test_df = pd.read_csv(test_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取测试文件\")\n",
    "            \n",
    "            print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
    "            regular_df = pd.read_csv(regular_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取规范文件\")\n",
    "            \n",
    "            test_encoding = enc\n",
    "            regular_encoding = enc\n",
    "            break\n",
    "        except Exception as e:\n",
    "            print(f\"使用 {enc} 读取失败: {e}\")\n",
    "\n",
    "# 如果成功读取文件，继续处理\n",
    "if 'test_df' in locals() and 'regular_df' in locals():\n",
    "    # 创建规范字典，键为注释，值为对应的规则\n",
    "    regular_annotations = regular_df['注释'].tolist()\n",
    "    \n",
    "    # 准备测试数据中的字段组合\n",
    "    combined_fields = []\n",
    "    paragraph_names = []\n",
    "    statement_names = []\n",
    "    value_item_names = []\n",
    "    \n",
    "    for _, row in test_df.iterrows():\n",
    "        combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
    "        combined_fields.append(combined_field)\n",
    "        paragraph_names.append(row['ParagraphName'])\n",
    "        statement_names.append(row['StatementName'])\n",
    "        value_item_names.append(row['ValueItemName'])\n",
    "    \n",
    "    # 定义TF-IDF相似度计算方法\n",
    "    def calculate_tfidf_similarity(query, candidates):\n",
    "        \"\"\"计算TF-IDF相似度，返回最佳匹配和分数\"\"\"\n",
    "        try:\n",
    "            # 对中文文本进行分词\n",
    "            segmented_query = ' '.join(jieba.cut(query))\n",
    "            segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n",
    "            \n",
    "            # 计算TF-IDF向量\n",
    "            vectorizer = TfidfVectorizer()\n",
    "            tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n",
    "            \n",
    "            # 计算余弦相似度\n",
    "            cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n",
    "            \n",
    "            # 找到最佳匹配\n",
    "            best_idx = np.argmax(cosine_sim)\n",
    "            return candidates[best_idx], cosine_sim[best_idx]\n",
    "        except Exception as e:\n",
    "            print(f\"TF-IDF计算失败: {e}\")\n",
    "            return \"未匹配\", 0.0\n",
    "    \n",
    "    # 对每个测试字段进行匹配\n",
    "    tfidf_matches = []\n",
    "    best_matches = []\n",
    "    best_match_methods = []\n",
    "    similarity_scores = []\n",
    "    \n",
    "    print(\"开始匹配注释...\")\n",
    "    for i in range(len(combined_fields)):\n",
    "        if i % 100 == 0:\n",
    "            print(f\"处理第 {i}/{len(combined_fields)} 条记录...\")\n",
    "        \n",
    "        query = combined_fields[i]\n",
    "        paragraph_name = paragraph_names[i]\n",
    "        value_item_name = value_item_names[i]\n",
    "        \n",
    "        # 1. 首先检查是否有注释包含ParagraphName\n",
    "        paragraph_matches = [ann for ann in regular_annotations if paragraph_name in ann]\n",
    "        \n",
    "        if paragraph_matches:\n",
    "            # 2. 如果有包含ParagraphName的注释，再检查是否有同时包含ValueItemName的\n",
    "            value_matches = [ann for ann in paragraph_matches if value_item_name in ann]\n",
    "            if value_matches:\n",
    "                # 找到同时包含ParagraphName和ValueItemName的注释\n",
    "                best_match = value_matches[0]  # 取第一个匹配\n",
    "                best_match_method = \"精确匹配(段落+值)\"\n",
    "                similarity_score = 1.0  # 精确匹配给予最高分\n",
    "            else:\n",
    "                # 只找到包含ParagraphName的注释\n",
    "                best_match = paragraph_matches[0]  # 取第一个匹配\n",
    "                best_match_method = \"段落匹配\"\n",
    "                similarity_score = 0.8  # 段落匹配给予较高分\n",
    "        else:\n",
    "            # 3. 如果没有包含ParagraphName的注释，使用TF-IDF相似度\n",
    "            best_match, similarity_score = calculate_tfidf_similarity(query, regular_annotations)\n",
    "            best_match_method = \"TF-IDF相似度\"\n",
    "        \n",
    "        # 计算TF-IDF匹配以便比较\n",
    "        tfidf_match, _ = calculate_tfidf_similarity(query, regular_annotations)\n",
    "        tfidf_matches.append(tfidf_match)\n",
    "        \n",
    "        best_matches.append(best_match)\n",
    "        best_match_methods.append(best_match_method)\n",
    "        similarity_scores.append(similarity_score)\n",
    "    \n",
    "    # 获取ValueItemKind列的位置\n",
    "    kind_idx = test_df.columns.get_loc('ValueItemKind')\n",
    "    \n",
    "    # 在ValueItemKind列前插入新的列\n",
    "    test_df.insert(kind_idx, 'Similarity_Score', similarity_scores)\n",
    "    test_df.insert(kind_idx, 'Best_Match_Method', best_match_methods)\n",
    "    test_df.insert(kind_idx, 'Best_Match', best_matches)\n",
    "    test_df.insert(kind_idx, 'TFIDF_Match', tfidf_matches)\n",
    "    \n",
    "    # 保存结果\n",
    "    test_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_with_tfidf.csv', \n",
    "                   index=False, encoding=test_encoding)\n",
    "    \n",
    "    # 打印匹配结果统计\n",
    "    print(\"\\n最佳匹配方法统计:\")\n",
    "    print(pd.Series(best_match_methods).value_counts())\n",
    "    \n",
    "    print(\"\\n匹配结果统计:\")\n",
    "    print(\"最佳匹配结果:\")\n",
    "    print(pd.Series(best_matches).value_counts().head(10))\n",
    "    print(\"\\nTF-IDF匹配结果:\")\n",
    "    print(pd.Series(tfidf_matches).value_counts().head(10))\n",
    "    \n",
    "    # 打印前5行匹配结果示例\n",
    "    print(\"\\n前5行匹配结果示例:\")\n",
    "    for i in range(min(5, len(test_df))):\n",
    "        print(f\"原始字段: {combined_fields[i]}\")\n",
    "        print(f\"最佳匹配: {test_df.iloc[i]['Best_Match']} (方法: {test_df.iloc[i]['Best_Match_Method']})\")\n",
    "        print(f\"相似度分数: {test_df.iloc[i]['Similarity_Score']:.4f}\")\n",
    "        print(f\"TF-IDF匹配: {test_df.iloc[i]['TFIDF_Match']}\")\n",
    "        print(\"-\" * 50)\n",
    "else:\n",
    "    print(\"无法读取文件，请手动检查文件编码\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "测试文件编码: GB2312\n",
      "规范文件编码: utf-8\n",
      "文件成功读取！\n",
      "开始匹配注释...\n",
      "处理第 0/127 条记录...\n",
      "处理第 100/127 条记录...\n",
      "\n",
      "匹配完成，共处理 127 条记录\n",
      "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\n",
      "\n",
      "前3行匹配结果示例:\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
      "匹配1: 患者姓名 (分数: 0.5819)\n",
      "匹配2: 患者姓名 (分数: 0.5819)\n",
      "匹配3: 姓名 (分数: 0.5574)\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
      "匹配1: 患者性别 (分数: 0.6637)\n",
      "匹配2: 性别 (分数: 0.6416)\n",
      "匹配3: 性别 (分数: 0.6416)\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
      "匹配1: 患者年龄 (分数: 0.6576)\n",
      "匹配2: 年龄 (分数: 0.6348)\n",
      "匹配3: 年龄 (分数: 0.6348)\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import chardet\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import jieba\n",
    "\n",
    "# 首先检测文件的实际编码\n",
    "def detect_encoding(file_path):\n",
    "    with open(file_path, 'rb') as f:\n",
    "        result = chardet.detect(f.read())\n",
    "    return result['encoding']\n",
    "\n",
    "# 检测文件编码\n",
    "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
    "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
    "\n",
    "test_encoding = detect_encoding(test_file)\n",
    "regular_encoding = detect_encoding(regular_file)\n",
    "\n",
    "print(f\"测试文件编码: {test_encoding}\")\n",
    "print(f\"规范文件编码: {regular_encoding}\")\n",
    "\n",
    "# 尝试使用检测到的编码读取文件\n",
    "try:\n",
    "    # 读取规范文件\n",
    "    regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
    "    \n",
    "    # 读取测试数据\n",
    "    test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
    "    \n",
    "    print(\"文件成功读取！\")\n",
    "except Exception as e:\n",
    "    print(f\"使用检测到的编码读取失败: {e}\")\n",
    "    \n",
    "    # 尝试其他常见编码\n",
    "    encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
    "    \n",
    "    for enc in encodings:\n",
    "        try:\n",
    "            print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
    "            test_df = pd.read_csv(test_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取测试文件\")\n",
    "            \n",
    "            print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
    "            regular_df = pd.read_csv(regular_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取规范文件\")\n",
    "            \n",
    "            test_encoding = enc\n",
    "            regular_encoding = enc\n",
    "            break\n",
    "        except Exception as e:\n",
    "            print(f\"使用 {enc} 读取失败: {e}\")\n",
    "\n",
    "# 如果成功读取文件，继续处理\n",
    "if 'test_df' in locals() and 'regular_df' in locals():\n",
    "    # 创建规范字典，键为注释，值为对应的规则\n",
    "    regular_annotations = regular_df['注释'].tolist()\n",
    "    \n",
    "    # 准备测试数据中的字段组合\n",
    "    combined_fields = []\n",
    "    \n",
    "    for _, row in test_df.iterrows():\n",
    "        combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
    "        combined_fields.append(combined_field)\n",
    "    \n",
    "    # 定义TF-IDF相似度计算方法，返回前3个最佳匹配\n",
    "    def calculate_top3_tfidf_similarity(query, candidates):\n",
    "        \"\"\"计算TF-IDF相似度，返回前3个最佳匹配和分数\"\"\"\n",
    "        try:\n",
    "            # 对中文文本进行分词\n",
    "            segmented_query = ' '.join(jieba.cut(query))\n",
    "            segmented_candidates = [' '.join(jieba.cut(c)) for c in candidates]\n",
    "            \n",
    "            # 计算TF-IDF向量\n",
    "            vectorizer = TfidfVectorizer()\n",
    "            tfidf_matrix = vectorizer.fit_transform([segmented_query] + segmented_candidates)\n",
    "            \n",
    "            # 计算余弦相似度\n",
    "            cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()\n",
    "            \n",
    "            # 找到前3个最佳匹配\n",
    "            top3_indices = np.argsort(cosine_sim)[::-1][:3]\n",
    "            top3_scores = cosine_sim[top3_indices]\n",
    "            \n",
    "            return top3_indices, top3_scores\n",
    "        except Exception as e:\n",
    "            print(f\"TF-IDF计算失败: {e}\")\n",
    "            return [-1, -1, -1], [0.0, 0.0, 0.0]\n",
    "    \n",
    "    # 创建结果DataFrame\n",
    "    result_data = []\n",
    "    \n",
    "    print(\"开始匹配注释...\")\n",
    "    for i in range(len(test_df)):\n",
    "        if i % 100 == 0:\n",
    "            print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n",
    "        \n",
    "        query = combined_fields[i]\n",
    "        \n",
    "        # 使用TF-IDF相似度匹配，获取前3个最佳匹配\n",
    "        top3_indices, top3_scores = calculate_top3_tfidf_similarity(query, regular_annotations)\n",
    "        \n",
    "        # 获取测试数据的相关字段\n",
    "        paragraph_name = test_df.iloc[i]['ParagraphName']\n",
    "        statement_name = test_df.iloc[i]['StatementName']\n",
    "        value_item_name = test_df.iloc[i]['ValueItemName']\n",
    "        display_string = test_df.iloc[i]['DisplayString']\n",
    "        \n",
    "        # 获取前3个规范数据的相关字段\n",
    "        regular_nodes = []\n",
    "        regular_annotations_matched = []\n",
    "        regular_descriptions = []\n",
    "        \n",
    "        for idx, score in zip(top3_indices, top3_scores):\n",
    "            if idx >= 0:\n",
    "                regular_nodes.append(regular_df.iloc[idx]['节点名'])\n",
    "                regular_annotations_matched.append(regular_df.iloc[idx]['注释'])\n",
    "                regular_descriptions.append(regular_df.iloc[idx]['说明'])\n",
    "            else:\n",
    "                regular_nodes.append(\"未匹配\")\n",
    "                regular_annotations_matched.append(\"未匹配\")\n",
    "                regular_descriptions.append(\"未匹配\")\n",
    "        \n",
    "        # 确保有3个结果（如果候选项少于3个）\n",
    "        while len(regular_nodes) < 3:\n",
    "            regular_nodes.append(\"未匹配\")\n",
    "            regular_annotations_matched.append(\"未匹配\")\n",
    "            regular_descriptions.append(\"未匹配\")\n",
    "            top3_scores = np.append(top3_scores, 0.0)\n",
    "        \n",
    "        # 添加到结果数据\n",
    "        result_data.append({\n",
    "            'ParagraphName': paragraph_name,\n",
    "            'StatementName': statement_name,\n",
    "            'ValueItemName': value_item_name,\n",
    "            'DisplayString': display_string,\n",
    "            '规范节点名1': regular_nodes[0],\n",
    "            '规范注释1': regular_annotations_matched[0],\n",
    "            '规范说明1': regular_descriptions[0],\n",
    "            '相似度分数1': top3_scores[0],\n",
    "            '规范节点名2': regular_nodes[1],\n",
    "            '规范注释2': regular_annotations_matched[1],\n",
    "            '规范说明2': regular_descriptions[1],\n",
    "            '相似度分数2': top3_scores[1],\n",
    "            '规范节点名3': regular_nodes[2],\n",
    "            '规范注释3': regular_annotations_matched[2],\n",
    "            '规范说明3': regular_descriptions[2],\n",
    "            '相似度分数3': top3_scores[2]\n",
    "        })\n",
    "    \n",
    "    # 创建结果DataFrame\n",
    "    result_df = pd.DataFrame(result_data)\n",
    "    \n",
    "    # 保存结果\n",
    "    result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv', \n",
    "                     index=False, encoding=test_encoding)\n",
    "    \n",
    "    print(f\"\\n匹配完成，共处理 {len(test_df)} 条记录\")\n",
    "    print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\")\n",
    "    \n",
    "    # 打印前3行匹配结果示例\n",
    "    print(\"\\n前3行匹配结果示例:\")\n",
    "    for i in range(min(3, len(result_df))):\n",
    "        print(f\"原始字段: {combined_fields[i]}\")\n",
    "        print(f\"匹配1: {result_df.iloc[i]['规范注释1']} (分数: {result_df.iloc[i]['相似度分数1']:.4f})\")\n",
    "        print(f\"匹配2: {result_df.iloc[i]['规范注释2']} (分数: {result_df.iloc[i]['相似度分数2']:.4f})\")\n",
    "        print(f\"匹配3: {result_df.iloc[i]['规范注释3']} (分数: {result_df.iloc[i]['相似度分数3']:.4f})\")\n",
    "        print(\"-\" * 50)\n",
    "else:\n",
    "    print(\"无法读取文件，请手动检查文件编码\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "测试文件编码: GB2312\n",
      "规范文件编码: utf-8\n",
      "文件成功读取！\n",
      "加载Sentence Transformer模型...\n",
      "对规范注释进行编码...\n",
      "开始匹配注释...\n",
      "处理第 0/127 条记录...\n",
      "处理第 100/127 条记录...\n",
      "\n",
      "匹配完成，共处理 127 条记录\n",
      "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_transformer_top3_match.csv\n",
      "\n",
      "前3行匹配结果示例:\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
      "匹配1: 讨论经过-病理科-医师姓名 (分数: 0.8398)\n",
      "匹配2: 讨论经过-放射科-医师姓名 (分数: 0.8398)\n",
      "匹配3: 讨论经过-放化疗科-医师姓名 (分数: 0.8384)\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
      "匹配1: 辅助检查-乙肝病毒-医院名称 (分数: 0.8360)\n",
      "匹配2: 辅助检查-骨扫描检查-医院名称 (分数: 0.8325)\n",
      "匹配3: 讨论经过-病理科-医师姓名 (分数: 0.8265)\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
      "匹配1: 入院时情况-患者年龄 (分数: 0.8390)\n",
      "匹配2: 辅助检查-乙肝病毒-医院名称 (分数: 0.8046)\n",
      "匹配3: 辅助检查-骨扫描检查-医院名称 (分数: 0.8012)\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import chardet\n",
    "from sentence_transformers import SentenceTransformer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import torch\n",
    "\n",
    "# 首先检测文件的实际编码\n",
    "def detect_encoding(file_path):\n",
    "    with open(file_path, 'rb') as f:\n",
    "        result = chardet.detect(f.read())\n",
    "    return result['encoding']\n",
    "\n",
    "# 检测文件编码\n",
    "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
    "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
    "\n",
    "test_encoding = detect_encoding(test_file)\n",
    "regular_encoding = detect_encoding(regular_file)\n",
    "\n",
    "print(f\"测试文件编码: {test_encoding}\")\n",
    "print(f\"规范文件编码: {regular_encoding}\")\n",
    "\n",
    "# 尝试使用检测到的编码读取文件\n",
    "try:\n",
    "    # 读取规范文件\n",
    "    regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
    "    \n",
    "    # 读取测试数据\n",
    "    test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
    "    \n",
    "    print(\"文件成功读取！\")\n",
    "except Exception as e:\n",
    "    print(f\"使用检测到的编码读取失败: {e}\")\n",
    "    \n",
    "    # 尝试其他常见编码\n",
    "    encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
    "    \n",
    "    for enc in encodings:\n",
    "        try:\n",
    "            print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
    "            test_df = pd.read_csv(test_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取测试文件\")\n",
    "            \n",
    "            print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
    "            regular_df = pd.read_csv(regular_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取规范文件\")\n",
    "            \n",
    "            test_encoding = enc\n",
    "            regular_encoding = enc\n",
    "            break\n",
    "        except Exception as e:\n",
    "            print(f\"使用 {enc} 读取失败: {e}\")\n",
    "\n",
    "# 如果成功读取文件，继续处理\n",
    "if 'test_df' in locals() and 'regular_df' in locals():\n",
    "    # 创建规范字典，键为注释，值为对应的规则\n",
    "    regular_annotations = regular_df['注释'].tolist()\n",
    "    \n",
    "    # 准备测试数据中的字段组合\n",
    "    combined_fields = []\n",
    "    \n",
    "    for _, row in test_df.iterrows():\n",
    "        combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
    "        combined_fields.append(combined_field)\n",
    "    \n",
    "    # 加载预训练的中文Sentence Transformer模型\n",
    "    print(\"加载Sentence Transformer模型...\")\n",
    "    \n",
    "            # 如果都失败，尝试加载基础模型\n",
    "    model = SentenceTransformer('/home/limeng/SICT/lung_test/all-MiniLM-L6-v2')\n",
    "\n",
    "    # 定义Sentence Transformer相似度计算方法，返回前3个最佳匹配\n",
    "    def calculate_top3_transformer_similarity(query, candidates, model):\n",
    "        \"\"\"计算Sentence Transformer相似度，返回前3个最佳匹配和分数\"\"\"\n",
    "        try:\n",
    "            # 编码查询和候选项\n",
    "            query_embedding = model.encode([query], convert_to_tensor=True)\n",
    "            candidate_embeddings = model.encode(candidates, convert_to_tensor=True)\n",
    "            \n",
    "            # 计算余弦相似度\n",
    "            cosine_scores = cosine_similarity(\n",
    "                query_embedding.cpu().numpy(), \n",
    "                candidate_embeddings.cpu().numpy()\n",
    "            )[0]\n",
    "            \n",
    "            # 找到前3个最佳匹配\n",
    "            top3_indices = np.argsort(cosine_scores)[::-1][:3]\n",
    "            top3_scores = cosine_scores[top3_indices]\n",
    "            \n",
    "            return top3_indices, top3_scores\n",
    "        except Exception as e:\n",
    "            print(f\"Transformer相似度计算失败: {e}\")\n",
    "            return [-1, -1, -1], [0.0, 0.0, 0.0]\n",
    "    \n",
    "    # 创建结果DataFrame\n",
    "    result_data = []\n",
    "    \n",
    "    # 先对规范注释进行编码，避免重复计算\n",
    "    print(\"对规范注释进行编码...\")\n",
    "    \n",
    "    # 创建结果DataFrame\n",
    "    result_data = []\n",
    "    \n",
    "    print(\"开始匹配注释...\")\n",
    "    for i in range(len(test_df)):\n",
    "        if i % 100 == 0:\n",
    "            print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n",
    "        \n",
    "        query = combined_fields[i]\n",
    "        \n",
    "        # 使用Sentence Transformer相似度匹配，获取前3个最佳匹配\n",
    "        top3_indices, top3_scores = calculate_top3_transformer_similarity(query, regular_annotations, model)\n",
    "        \n",
    "        # 获取测试数据的相关字段\n",
    "        paragraph_name = test_df.iloc[i]['ParagraphName']\n",
    "        statement_name = test_df.iloc[i]['StatementName']\n",
    "        value_item_name = test_df.iloc[i]['ValueItemName']\n",
    "        display_string = test_df.iloc[i]['DisplayString']\n",
    "        \n",
    "        # 获取前3个规范数据的相关字段\n",
    "        regular_nodes = []\n",
    "        regular_annotations_matched = []\n",
    "        regular_descriptions = []\n",
    "        \n",
    "        for idx, score in zip(top3_indices, top3_scores):\n",
    "            if idx >= 0:\n",
    "                regular_nodes.append(regular_df.iloc[idx]['节点名'])\n",
    "                regular_annotations_matched.append(regular_df.iloc[idx]['注释'])\n",
    "                regular_descriptions.append(regular_df.iloc[idx]['说明'])\n",
    "            else:\n",
    "                regular_nodes.append(\"未匹配\")\n",
    "                regular_annotations_matched.append(\"未匹配\")\n",
    "                regular_descriptions.append(\"未匹配\")\n",
    "        \n",
    "        # 确保有3个结果（如果候选项少于3个）\n",
    "        while len(regular_nodes) < 3:\n",
    "            regular_nodes.append(\"未匹配\")\n",
    "            regular_annotations_matched.append(\"未匹配\")\n",
    "            regular_descriptions.append(\"未匹配\")\n",
    "            top3_scores = np.append(top3_scores, 0.0)\n",
    "        \n",
    "        # 添加到结果数据\n",
    "        result_data.append({\n",
    "            'ParagraphName': paragraph_name,\n",
    "            'StatementName': statement_name,\n",
    "            'ValueItemName': value_item_name,\n",
    "            'DisplayString': display_string,\n",
    "            '规范节点名1': regular_nodes[0],\n",
    "            '规范注释1': regular_annotations_matched[0],\n",
    "            '规范说明1': regular_descriptions[0],\n",
    "            '相似度分数1': top3_scores[0],\n",
    "            '规范节点名2': regular_nodes[1],\n",
    "            '规范注释2': regular_annotations_matched[1],\n",
    "            '规范说明2': regular_descriptions[1],\n",
    "            '相似度分数2': top3_scores[1],\n",
    "            '规范节点名3': regular_nodes[2],\n",
    "            '规范注释3': regular_annotations_matched[2],\n",
    "            '规范说明3': regular_descriptions[2],\n",
    "            '相似度分数3': top3_scores[2]\n",
    "        })\n",
    "    \n",
    "    # 创建结果DataFrame\n",
    "    result_df = pd.DataFrame(result_data)\n",
    "    \n",
    "    # 保存结果\n",
    "    result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_transformer_top3_match.csv', \n",
    "                     index=False, encoding=test_encoding)\n",
    "    \n",
    "    print(f\"\\n匹配完成，共处理 {len(test_df)} 条记录\")\n",
    "    print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_transformer_top3_match.csv\")\n",
    "    \n",
    "    # 打印前3行匹配结果示例\n",
    "    print(\"\\n前3行匹配结果示例:\")\n",
    "    for i in range(min(3, len(result_df))):\n",
    "        print(f\"原始字段: {combined_fields[i]}\")\n",
    "        print(f\"匹配1: {result_df.iloc[i]['规范注释1']} (分数: {result_df.iloc[i]['相似度分数1']:.4f})\")\n",
    "        print(f\"匹配2: {result_df.iloc[i]['规范注释2']} (分数: {result_df.iloc[i]['相似度分数2']:.4f})\")\n",
    "        print(f\"匹配3: {result_df.iloc[i]['规范注释3']} (分数: {result_df.iloc[i]['相似度分数3']:.4f})\")\n",
    "        print(\"-\" * 50)\n",
    "else:\n",
    "    print(\"无法读取文件，请手动检查文件编码\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "测试文件编码: GB2312\n",
      "规范文件编码: utf-8\n",
      "文件成功读取！\n",
      "初始化TF-IDF向量化器...\n",
      "开始匹配注释...\n",
      "处理第 0/127 条记录...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
      "  warnings.warn(\n",
      "Building prefix dict from the default dictionary ...\n",
      "Dumping model to file cache /tmp/jieba.cache\n",
      "Loading model cost 0.934 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "处理第 100/127 条记录...\n",
      "\n",
      "匹配完成，共处理 127 条记录\n",
      "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\n",
      "\n",
      "前3行匹配结果示例:\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
      "匹配1: 患者姓名 (分数: 0.4840)\n",
      "匹配2: 患者姓名 (分数: 0.4840)\n",
      "匹配3: 姓名 (分数: 0.4637)\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
      "匹配1: 患者性别 (分数: 0.5414)\n",
      "匹配2: 性别 (分数: 0.5235)\n",
      "匹配3: 性别 (分数: 0.5235)\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
      "匹配1: 患者年龄 (分数: 0.5357)\n",
      "匹配2: 年龄 (分数: 0.5170)\n",
      "匹配3: 年龄 (分数: 0.5170)\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import chardet\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import jieba\n",
    "\n",
    "# 首先检测文件的实际编码\n",
    "def detect_encoding(file_path):\n",
    "    with open(file_path, 'rb') as f:\n",
    "        result = chardet.detect(f.read())\n",
    "    return result['encoding']\n",
    "\n",
    "# 检测文件编码\n",
    "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
    "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
    "\n",
    "test_encoding = detect_encoding(test_file)\n",
    "regular_encoding = detect_encoding(regular_file)\n",
    "\n",
    "print(f\"测试文件编码: {test_encoding}\")\n",
    "print(f\"规范文件编码: {regular_encoding}\")\n",
    "\n",
    "# 尝试使用检测到的编码读取文件\n",
    "try:\n",
    "    # 读取规范文件\n",
    "    regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
    "    \n",
    "    # 读取测试数据\n",
    "    test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
    "    \n",
    "    print(\"文件成功读取！\")\n",
    "except Exception as e:\n",
    "    print(f\"使用检测到的编码读取失败: {e}\")\n",
    "    \n",
    "    # 尝试其他常见编码\n",
    "    encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
    "    \n",
    "    for enc in encodings:\n",
    "        try:\n",
    "            print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
    "            test_df = pd.read_csv(test_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取测试文件\")\n",
    "            \n",
    "            print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
    "            regular_df = pd.read_csv(regular_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取规范文件\")\n",
    "            \n",
    "            test_encoding = enc\n",
    "            regular_encoding = enc\n",
    "            break\n",
    "        except Exception as e:\n",
    "            print(f\"使用 {enc} 读取失败: {e}\")\n",
    "\n",
    "# 如果成功读取文件，继续处理\n",
    "if 'test_df' in locals() and 'regular_df' in locals():\n",
    "    # 创建规范字典，键为注释，值为对应的规则\n",
    "    regular_annotations = regular_df['注释'].tolist()\n",
    "    \n",
    "    # 准备测试数据中的字段组合\n",
    "    combined_fields = []\n",
    "    \n",
    "    for _, row in test_df.iterrows():\n",
    "        combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
    "        combined_fields.append(combined_field)\n",
    "    \n",
    "    # 使用TF-IDF向量化文本\n",
    "    print(\"初始化TF-IDF向量化器...\")\n",
    "    \n",
    "    # 对中文文本进行分词处理\n",
    "    def tokenize_chinese(text):\n",
    "        return list(jieba.cut(text))\n",
    "    \n",
    "    # 初始化TF-IDF向量化器\n",
    "    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n",
    "    \n",
    "    # 定义TF-IDF相似度计算方法，返回前3个最佳匹配\n",
    "    def calculate_top3_tfidf_similarity(query, candidates, vectorizer):\n",
    "        \"\"\"计算TF-IDF相似度，返回前3个最佳匹配和分数\"\"\"\n",
    "        try:\n",
    "            # 将所有文本合并为一个列表进行向量化\n",
    "            all_texts = [query] + candidates\n",
    "            \n",
    "            # 拟合并转换所有文本\n",
    "            tfidf_matrix = vectorizer.fit_transform(all_texts)\n",
    "            \n",
    "            # 计算查询与所有候选项的余弦相似度\n",
    "            query_vector = tfidf_matrix[0:1]\n",
    "            candidate_vectors = tfidf_matrix[1:]\n",
    "            \n",
    "            cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n",
    "            \n",
    "            # 找到前3个最佳匹配\n",
    "            top3_indices = np.argsort(cosine_scores)[::-1][:3]\n",
    "            top3_scores = cosine_scores[top3_indices]\n",
    "            \n",
    "            return top3_indices, top3_scores\n",
    "        except Exception as e:\n",
    "            print(f\"TF-IDF相似度计算失败: {e}\")\n",
    "            return [-1, -1, -1], [0.0, 0.0, 0.0]\n",
    "    \n",
    "    # 创建结果DataFrame\n",
    "    result_data = []\n",
    "    \n",
    "    print(\"开始匹配注释...\")\n",
    "    for i in range(len(test_df)):\n",
    "        if i % 100 == 0:\n",
    "            print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n",
    "        \n",
    "        query = combined_fields[i]\n",
    "        \n",
    "        # 使用TF-IDF相似度匹配，获取前3个最佳匹配\n",
    "        top3_indices, top3_scores = calculate_top3_tfidf_similarity(query, regular_annotations, tfidf_vectorizer)\n",
    "        \n",
    "        # 获取测试数据的相关字段\n",
    "        paragraph_name = test_df.iloc[i]['ParagraphName']\n",
    "        statement_name = test_df.iloc[i]['StatementName']\n",
    "        value_item_name = test_df.iloc[i]['ValueItemName']\n",
    "        display_string = test_df.iloc[i]['DisplayString']\n",
    "        \n",
    "        # 获取前3个规范数据的相关字段\n",
    "        regular_nodes = []\n",
    "        regular_annotations_matched = []\n",
    "        regular_descriptions = []\n",
    "        \n",
    "        for idx, score in zip(top3_indices, top3_scores):\n",
    "            if idx >= 0:\n",
    "                regular_nodes.append(regular_df.iloc[idx]['节点名'])\n",
    "                regular_annotations_matched.append(regular_df.iloc[idx]['注释'])\n",
    "                regular_descriptions.append(regular_df.iloc[idx]['说明'])\n",
    "            else:\n",
    "                regular_nodes.append(\"未匹配\")\n",
    "                regular_annotations_matched.append(\"未匹配\")\n",
    "                regular_descriptions.append(\"未匹配\")\n",
    "        \n",
    "        # 确保有3个结果（如果候选项少于3个）\n",
    "        while len(regular_nodes) < 3:\n",
    "            regular_nodes.append(\"未匹配\")\n",
    "            regular_annotations_matched.append(\"未匹配\")\n",
    "            regular_descriptions.append(\"未匹配\")\n",
    "            top3_scores = np.append(top3_scores, 0.0)\n",
    "        \n",
    "        # 添加到结果数据\n",
    "        result_data.append({\n",
    "            'ParagraphName': paragraph_name,\n",
    "            'StatementName': statement_name,\n",
    "            'ValueItemName': value_item_name,\n",
    "            'DisplayString': display_string,\n",
    "            '规范节点名1': regular_nodes[0],\n",
    "            '规范注释1': regular_annotations_matched[0],\n",
    "            '规范说明1': regular_descriptions[0],\n",
    "            '相似度分数1': top3_scores[0],\n",
    "            '规范节点名2': regular_nodes[1],\n",
    "            '规范注释2': regular_annotations_matched[1],\n",
    "            '规范说明2': regular_descriptions[1],\n",
    "            '相似度分数2': top3_scores[1],\n",
    "            '规范节点名3': regular_nodes[2],\n",
    "            '规范注释3': regular_annotations_matched[2],\n",
    "            '规范说明3': regular_descriptions[2],\n",
    "            '相似度分数3': top3_scores[2]\n",
    "        })\n",
    "    \n",
    "    # 创建结果DataFrame\n",
    "    result_df = pd.DataFrame(result_data)\n",
    "    \n",
    "    # 保存结果\n",
    "    result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv', \n",
    "                     index=False, encoding=test_encoding)\n",
    "    \n",
    "    print(f\"\\n匹配完成，共处理 {len(test_df)} 条记录\")\n",
    "    print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_top3_match.csv\")\n",
    "    \n",
    "    # 打印前3行匹配结果示例\n",
    "    print(\"\\n前3行匹配结果示例:\")\n",
    "    for i in range(min(3, len(result_df))):\n",
    "        print(f\"原始字段: {combined_fields[i]}\")\n",
    "        print(f\"匹配1: {result_df.iloc[i]['规范注释1']} (分数: {result_df.iloc[i]['相似度分数1']:.4f})\")\n",
    "        print(f\"匹配2: {result_df.iloc[i]['规范注释2']} (分数: {result_df.iloc[i]['相似度分数2']:.4f})\")\n",
    "        print(f\"匹配3: {result_df.iloc[i]['规范注释3']} (分数: {result_df.iloc[i]['相似度分数3']:.4f})\")\n",
    "        print(\"-\" * 50)\n",
    "else:\n",
    "    print(\"无法读取文件，请手动检查文件编码\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "测试文件编码: GB2312\n",
      "规范文件编码: utf-8\n",
      "文件成功读取！\n",
      "初始化TF-IDF向量化器...\n",
      "开始匹配注释...\n",
      "处理第 0/127 条记录...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "处理第 100/127 条记录...\n",
      "\n",
      "匹配完成，共处理 127 条记录\n",
      "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_fuzzy_match.csv\n",
      "\n",
      "前3行匹配结果示例:\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
      "TF-IDF匹配结果:\n",
      "  匹配1: 患者姓名 (分数: 0.4840)\n",
      "  匹配2: 患者姓名 (分数: 0.4840)\n",
      "  匹配3: 姓名 (分数: 0.4637)\n",
      "FuzzyWuzzy匹配结果:\n",
      "  匹配1: 患者姓名 (分数: 0.3300)\n",
      "  匹配2: 入院时情况-患者姓名 (分数: 0.3300)\n",
      "  匹配3: 患者姓名 (分数: 0.3300)\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
      "TF-IDF匹配结果:\n",
      "  匹配1: 患者性别 (分数: 0.5414)\n",
      "  匹配2: 性别 (分数: 0.5235)\n",
      "  匹配3: 性别 (分数: 0.5235)\n",
      "FuzzyWuzzy匹配结果:\n",
      "  匹配1: 患者姓名 (分数: 0.3600)\n",
      "  匹配2: 入院时情况-患者姓名 (分数: 0.3600)\n",
      "  匹配3: 患者姓名 (分数: 0.3600)\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
      "TF-IDF匹配结果:\n",
      "  匹配1: 患者年龄 (分数: 0.5357)\n",
      "  匹配2: 年龄 (分数: 0.5170)\n",
      "  匹配3: 年龄 (分数: 0.5170)\n",
      "FuzzyWuzzy匹配结果:\n",
      "  匹配1: 患者姓名 (分数: 0.3600)\n",
      "  匹配2: 入院时情况-患者姓名 (分数: 0.3600)\n",
      "  匹配3: 患者姓名 (分数: 0.3600)\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import chardet\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import jieba\n",
    "from fuzzywuzzy import fuzz\n",
    "from fuzzywuzzy import process\n",
    "\n",
    "# 首先检测文件的实际编码\n",
    "def detect_encoding(file_path):\n",
    "    with open(file_path, 'rb') as f:\n",
    "        result = chardet.detect(f.read())\n",
    "    return result['encoding']\n",
    "\n",
    "# 检测文件编码\n",
    "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
    "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
    "\n",
    "test_encoding = detect_encoding(test_file)\n",
    "regular_encoding = detect_encoding(regular_file)\n",
    "\n",
    "print(f\"测试文件编码: {test_encoding}\")\n",
    "print(f\"规范文件编码: {regular_encoding}\")\n",
    "\n",
    "# 尝试使用检测到的编码读取文件\n",
    "try:\n",
    "    # 读取规范文件\n",
    "    regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
    "    \n",
    "    # 读取测试数据\n",
    "    test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
    "    \n",
    "    print(\"文件成功读取！\")\n",
    "except Exception as e:\n",
    "    print(f\"使用检测到的编码读取失败: {e}\")\n",
    "    \n",
    "    # 尝试其他常见编码\n",
    "    encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
    "    \n",
    "    for enc in encodings:\n",
    "        try:\n",
    "            print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
    "            test_df = pd.read_csv(test_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取测试文件\")\n",
    "            \n",
    "            print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
    "            regular_df = pd.read_csv(regular_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取规范文件\")\n",
    "            \n",
    "            test_encoding = enc\n",
    "            regular_encoding = enc\n",
    "            break\n",
    "        except Exception as e:\n",
    "            print(f\"使用 {enc} 读取失败: {e}\")\n",
    "\n",
    "# 如果成功读取文件，继续处理\n",
    "if 'test_df' in locals() and 'regular_df' in locals():\n",
    "    # 创建规范字典，键为注释，值为对应的规则\n",
    "    regular_annotations = regular_df['注释'].tolist()\n",
    "    \n",
    "    # 准备测试数据中的字段组合\n",
    "    combined_fields = []\n",
    "    \n",
    "    for _, row in test_df.iterrows():\n",
    "        combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
    "        combined_fields.append(combined_field)\n",
    "    \n",
    "    # 使用TF-IDF向量化文本\n",
    "    print(\"初始化TF-IDF向量化器...\")\n",
    "    \n",
    "    # 对中文文本进行分词处理\n",
    "    def tokenize_chinese(text):\n",
    "        return list(jieba.cut(text))\n",
    "    \n",
    "    # 初始化TF-IDF向量化器\n",
    "    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n",
    "    \n",
    "    # 定义TF-IDF相似度计算方法，返回最佳匹配（最多3个）\n",
    "    def calculate_tfidf_similarity(query, candidates, vectorizer, max_matches=3, threshold=0.1):\n",
    "        \"\"\"计算TF-IDF相似度，返回最佳匹配（最多max_matches个）\"\"\"\n",
    "        try:\n",
    "            # 将所有文本合并为一个列表进行向量化\n",
    "            all_texts = [query] + candidates\n",
    "            \n",
    "            # 拟合并转换所有文本\n",
    "            tfidf_matrix = vectorizer.fit_transform(all_texts)\n",
    "            \n",
    "            # 计算查询与所有候选项的余弦相似度\n",
    "            query_vector = tfidf_matrix[0:1]\n",
    "            candidate_vectors = tfidf_matrix[1:]\n",
    "            \n",
    "            cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n",
    "            \n",
    "            # 找到相似度大于阈值的匹配\n",
    "            valid_indices = np.where(cosine_scores > threshold)[0]\n",
    "            \n",
    "            # 按相似度降序排序\n",
    "            sorted_indices = valid_indices[np.argsort(cosine_scores[valid_indices])[::-1]]\n",
    "            \n",
    "            # 最多取max_matches个\n",
    "            top_indices = sorted_indices[:max_matches]\n",
    "            top_scores = cosine_scores[top_indices]\n",
    "            \n",
    "            return top_indices, top_scores\n",
    "        except Exception as e:\n",
    "            print(f\"TF-IDF相似度计算失败: {e}\")\n",
    "            return np.array([]), np.array([])\n",
    "    \n",
    "    # 定义FuzzyWuzzy相似度计算方法，返回最佳匹配（最多3个）\n",
    "    def calculate_fuzzy_similarity(query, candidates, max_matches=3):\n",
    "        \"\"\"计算FuzzyWuzzy相似度，返回最佳匹配（最多max_matches个）\"\"\"\n",
    "        try:\n",
    "            # 使用process.extract获取最佳匹配\n",
    "            matches = process.extract(query, candidates, limit=max_matches, scorer=fuzz.token_sort_ratio)\n",
    "            \n",
    "            # 提取索引和分数\n",
    "            indices = []\n",
    "            scores = []\n",
    "            \n",
    "            for match in matches:\n",
    "                # match格式为(匹配文本, 分数)\n",
    "                matched_text, score = match\n",
    "                # 找到匹配文本在原始列表中的索引\n",
    "                idx = candidates.index(matched_text)\n",
    "                indices.append(idx)\n",
    "                scores.append(score / 100.0)  # 将分数归一化到0-1范围\n",
    "            \n",
    "            return np.array(indices), np.array(scores)\n",
    "        except Exception as e:\n",
    "            print(f\"FuzzyWuzzy相似度计算失败: {e}\")\n",
    "            return np.array([]), np.array([])\n",
    "    \n",
    "    # 创建结果DataFrame\n",
    "    result_data = []\n",
    "    \n",
    "    print(\"开始匹配注释...\")\n",
    "    \n",
    "    for i in range(len(test_df)):\n",
    "        if i % 100 == 0:\n",
    "            print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n",
    "        \n",
    "        query = combined_fields[i]\n",
    "        \n",
    "        # 使用TF-IDF相似度匹配，获取最佳匹配（最多3个）\n",
    "        tfidf_indices, tfidf_scores = calculate_tfidf_similarity(query, regular_annotations, tfidf_vectorizer)\n",
    "        \n",
    "        # 使用FuzzyWuzzy相似度匹配，获取最佳匹配（最多3个）\n",
    "        fuzzy_indices, fuzzy_scores = calculate_fuzzy_similarity(query, regular_annotations)\n",
    "        \n",
    "        # 获取测试数据的相关字段\n",
    "        paragraph_name = test_df.iloc[i]['ParagraphName']\n",
    "        statement_name = test_df.iloc[i]['StatementName']\n",
    "        value_item_name = test_df.iloc[i]['ValueItemName']\n",
    "        display_string = test_df.iloc[i]['DisplayString']\n",
    "        \n",
    "        # 创建结果字典\n",
    "        result_dict = {\n",
    "            'ParagraphName': paragraph_name,\n",
    "            'StatementName': statement_name,\n",
    "            'ValueItemName': value_item_name,\n",
    "            'DisplayString': display_string\n",
    "        }\n",
    "        \n",
    "        # 添加TF-IDF匹配结果\n",
    "        for j in range(min(3, len(tfidf_indices))):\n",
    "            idx = tfidf_indices[j]\n",
    "            score = tfidf_scores[j]\n",
    "            result_dict[f'TFIDF_规范节点名{j+1}'] = regular_df.iloc[idx]['节点名']\n",
    "            result_dict[f'TFIDF_规范注释{j+1}'] = regular_df.iloc[idx]['注释']\n",
    "            result_dict[f'TFIDF_规范说明{j+1}'] = regular_df.iloc[idx]['说明']\n",
    "            result_dict[f'TFIDF_相似度分数{j+1}'] = score\n",
    "        \n",
    "        # 添加FuzzyWuzzy匹配结果\n",
    "        for j in range(min(3, len(fuzzy_indices))):\n",
    "            idx = fuzzy_indices[j]\n",
    "            score = fuzzy_scores[j]\n",
    "            result_dict[f'Fuzzy_规范节点名{j+1}'] = regular_df.iloc[idx]['节点名']\n",
    "            result_dict[f'Fuzzy_规范注释{j+1}'] = regular_df.iloc[idx]['注释']\n",
    "            result_dict[f'Fuzzy_规范说明{j+1}'] = regular_df.iloc[idx]['说明']\n",
    "            result_dict[f'Fuzzy_相似度分数{j+1}'] = score\n",
    "        \n",
    "        # 添加到结果数据\n",
    "        result_data.append(result_dict)\n",
    "    \n",
    "    # 创建结果DataFrame\n",
    "    result_df = pd.DataFrame(result_data)\n",
    "    \n",
    "    # 保存结果\n",
    "    result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_fuzzy_match.csv', \n",
    "                     index=False, encoding=test_encoding)\n",
    "    \n",
    "    print(f\"\\n匹配完成，共处理 {len(test_df)} 条记录\")\n",
    "    print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_fuzzy_match.csv\")\n",
    "    \n",
    "    # 打印前3行匹配结果示例\n",
    "    print(\"\\n前3行匹配结果示例:\")\n",
    "    for i in range(min(3, len(result_df))):\n",
    "        print(f\"原始字段: {combined_fields[i]}\")\n",
    "        \n",
    "        print(\"TF-IDF匹配结果:\")\n",
    "        for j in range(1, 4):\n",
    "            if f'TFIDF_规范注释{j}' in result_df.columns and not pd.isna(result_df.iloc[i].get(f'TFIDF_规范注释{j}', None)):\n",
    "                print(f\"  匹配{j}: {result_df.iloc[i][f'TFIDF_规范注释{j}']} (分数: {result_df.iloc[i][f'TFIDF_相似度分数{j}']:.4f})\")\n",
    "        \n",
    "        print(\"FuzzyWuzzy匹配结果:\")\n",
    "        for j in range(1, 4):\n",
    "            if f'Fuzzy_规范注释{j}' in result_df.columns and not pd.isna(result_df.iloc[i].get(f'Fuzzy_规范注释{j}', None)):\n",
    "                print(f\"  匹配{j}: {result_df.iloc[i][f'Fuzzy_规范注释{j}']} (分数: {result_df.iloc[i][f'Fuzzy_相似度分数{j}']:.4f})\")\n",
    "        \n",
    "        print(\"-\" * 50)\n",
    "else:\n",
    "    print(\"无法读取文件，请手动检查文件编码\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
      "  warnings.warn(\n",
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /tmp/jieba.cache\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "测试文件编码: GB2312\n",
      "规范文件编码: utf-8\n",
      "文件成功读取！\n",
      "初始化TF-IDF向量化器...\n",
      "开始匹配注释...\n",
      "处理第 0/127 条记录...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading model cost 0.942 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "处理第 100/127 条记录...\n",
      "\n",
      "匹配完成，共处理 127 条记录\n",
      "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv\n",
      "\n",
      "前3行匹配结果示例:\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
      "最佳匹配: 患者姓名 (分数: 0.4840)\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
      "最佳匹配: 患者性别 (分数: 0.5414)\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
      "最佳匹配: 患者年龄 (分数: 0.5357)\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import chardet\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import jieba\n",
    "\n",
    "# 首先检测文件的实际编码\n",
    "def detect_encoding(file_path):\n",
    "    with open(file_path, 'rb') as f:\n",
    "        result = chardet.detect(f.read())\n",
    "    return result['encoding']\n",
    "\n",
    "# 检测文件编码\n",
    "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
    "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
    "\n",
    "test_encoding = detect_encoding(test_file)\n",
    "regular_encoding = detect_encoding(regular_file)\n",
    "\n",
    "print(f\"测试文件编码: {test_encoding}\")\n",
    "print(f\"规范文件编码: {regular_encoding}\")\n",
    "\n",
    "# 尝试使用检测到的编码读取文件\n",
    "try:\n",
    "    # 读取规范文件\n",
    "    regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
    "    \n",
    "    # 读取测试数据\n",
    "    test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
    "    \n",
    "    print(\"文件成功读取！\")\n",
    "except Exception as e:\n",
    "    print(f\"使用检测到的编码读取失败: {e}\")\n",
    "    \n",
    "    # 尝试其他常见编码\n",
    "    encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
    "    \n",
    "    for enc in encodings:\n",
    "        try:\n",
    "            print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
    "            test_df = pd.read_csv(test_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取测试文件\")\n",
    "            \n",
    "            print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
    "            regular_df = pd.read_csv(regular_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取规范文件\")\n",
    "            \n",
    "            test_encoding = enc\n",
    "            regular_encoding = enc\n",
    "            break\n",
    "        except Exception as e:\n",
    "            print(f\"使用 {enc} 读取失败: {e}\")\n",
    "\n",
    "# 如果成功读取文件，继续处理\n",
    "if 'test_df' in locals() and 'regular_df' in locals():\n",
    "    # 创建规范字典，键为注释，值为对应的规则\n",
    "    regular_annotations = regular_df['注释'].tolist()\n",
    "    \n",
    "    # 准备测试数据中的字段组合\n",
    "    combined_fields = []\n",
    "    \n",
    "    for _, row in test_df.iterrows():\n",
    "        combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
    "        combined_fields.append(combined_field)\n",
    "    \n",
    "    # 使用TF-IDF向量化文本\n",
    "    print(\"初始化TF-IDF向量化器...\")\n",
    "    \n",
    "    # 对中文文本进行分词处理\n",
    "    def tokenize_chinese(text):\n",
    "        return list(jieba.cut(text))\n",
    "    \n",
    "    # 初始化TF-IDF向量化器\n",
    "    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n",
    "    \n",
    "    # 定义TF-IDF相似度计算方法，只返回最佳匹配（分数最高的）\n",
    "    def calculate_best_tfidf_match(query, candidates, vectorizer):\n",
    "        \"\"\"计算TF-IDF相似度，只返回最佳匹配（分数最高的）\"\"\"\n",
    "        try:\n",
    "            # 将所有文本合并为一个列表进行向量化\n",
    "            all_texts = [query] + candidates\n",
    "            \n",
    "            # 拟合并转换所有文本\n",
    "            tfidf_matrix = vectorizer.fit_transform(all_texts)\n",
    "            \n",
    "            # 计算查询与所有候选项的余弦相似度\n",
    "            query_vector = tfidf_matrix[0:1]\n",
    "            candidate_vectors = tfidf_matrix[1:]\n",
    "            \n",
    "            cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n",
    "            \n",
    "            # 找到分数最高的匹配\n",
    "            best_index = np.argmax(cosine_scores)\n",
    "            best_score = cosine_scores[best_index]\n",
    "            \n",
    "            return best_index, best_score\n",
    "        except Exception as e:\n",
    "            print(f\"TF-IDF相似度计算失败: {e}\")\n",
    "            return -1, 0.0\n",
    "    \n",
    "    # 创建结果DataFrame\n",
    "    result_data = []\n",
    "    \n",
    "    print(\"开始匹配注释...\")\n",
    "    \n",
    "    for i in range(len(test_df)):\n",
    "        if i % 100 == 0:\n",
    "            print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n",
    "        \n",
    "        query = combined_fields[i]\n",
    "        \n",
    "        # 使用TF-IDF相似度匹配，只获取最佳匹配\n",
    "        best_index, best_score = calculate_best_tfidf_match(query, regular_annotations, tfidf_vectorizer)\n",
    "        \n",
    "        # 获取测试数据的相关字段\n",
    "        row = test_df.iloc[i]\n",
    "        \n",
    "        # 创建结果字典，包含原始字段\n",
    "        result_dict = {\n",
    "            'ParagraphName': row['ParagraphName'],\n",
    "            'StatementName': row['StatementName'],\n",
    "            'ValueItemName': row['ValueItemName'],\n",
    "            'DisplayString': row['DisplayString']\n",
    "        }\n",
    "        \n",
    "        # 添加SFZH, XGRQ, IPBLH字段（如果存在）\n",
    "        if 'SFZH' in test_df.columns:\n",
    "            result_dict['SFZH'] = row['SFZH']\n",
    "        if 'XGRQ' in test_df.columns:\n",
    "            result_dict['XGRQ'] = row['XGRQ']\n",
    "        if 'IPBLH' in test_df.columns:\n",
    "            result_dict['IPBLH'] = row['IPBLH']\n",
    "        \n",
    "        # 添加最佳TF-IDF匹配结果\n",
    "        if best_index >= 0:\n",
    "            result_dict['TFIDF_规范节点名'] = regular_df.iloc[best_index]['节点名']\n",
    "            result_dict['TFIDF_规范注释'] = regular_df.iloc[best_index]['注释']\n",
    "            result_dict['TFIDF_规范说明'] = regular_df.iloc[best_index]['说明']\n",
    "            result_dict['TFIDF_相似度分数'] = best_score\n",
    "        else:\n",
    "            result_dict['TFIDF_规范节点名'] = ''\n",
    "            result_dict['TFIDF_规范注释'] = ''\n",
    "            result_dict['TFIDF_规范说明'] = ''\n",
    "            result_dict['TFIDF_相似度分数'] = 0.0\n",
    "        \n",
    "        # 添加到结果数据\n",
    "        result_data.append(result_dict)\n",
    "    \n",
    "    # 创建结果DataFrame\n",
    "    result_df = pd.DataFrame(result_data)\n",
    "    \n",
    "    # 保存结果\n",
    "    result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv', \n",
    "                     index=False, encoding=test_encoding)\n",
    "    \n",
    "    print(f\"\\n匹配完成，共处理 {len(test_df)} 条记录\")\n",
    "    print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv\")\n",
    "    \n",
    "    # 打印前3行匹配结果示例\n",
    "    print(\"\\n前3行匹配结果示例:\")\n",
    "    for i in range(min(3, len(result_df))):\n",
    "        print(f\"原始字段: {combined_fields[i]}\")\n",
    "        print(f\"最佳匹配: {result_df.iloc[i]['TFIDF_规范注释']} (分数: {result_df.iloc[i]['TFIDF_相似度分数']:.4f})\")\n",
    "        print(\"-\" * 50)\n",
    "else:\n",
    "    print(\"无法读取文件，请手动检查文件编码\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "2025-03-10 09:55:11,393\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 03-10 09:55:19 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.\n",
      "WARNING 03-10 09:55:19 config.py:428] gptq quantization is not fully optimized yet. The speed can be slower than non-quantized models.\n",
      "INFO 03-10 09:55:19 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', speculative_config=None, tokenizer='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, chat_template_text_format=string, mm_processor_kwargs=None, pooler_config=None)\n",
      "INFO 03-10 09:55:20 selector.py:261] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.\n",
      "INFO 03-10 09:55:20 selector.py:144] Using XFormers backend.\n",
      "INFO 03-10 09:55:20 model_runner.py:1072] Starting to load model /opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[W310 09:55:20.487166102 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())\n",
      "Loading safetensors checkpoint shards:   0% Completed | 0/5 [00:00<?, ?it/s]\n",
      "Loading safetensors checkpoint shards:  20% Completed | 1/5 [00:00<00:03,  1.24it/s]\n",
      "Loading safetensors checkpoint shards:  40% Completed | 2/5 [00:03<00:04,  1.64s/it]\n",
      "Loading safetensors checkpoint shards:  60% Completed | 3/5 [00:05<00:03,  1.92s/it]\n",
      "Loading safetensors checkpoint shards:  80% Completed | 4/5 [00:08<00:02,  2.25s/it]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:10<00:00,  2.26s/it]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:10<00:00,  2.06s/it]\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 03-10 09:55:32 model_runner.py:1077] Loading model weights took 15.5028 GB\n",
      "INFO 03-10 09:55:42 worker.py:232] Memory profiling results: total_gpu_memory=31.73GiB initial_memory_usage=15.89GiB peak_torch_memory=19.42GiB memory_usage_post_profile=15.96GiB non_torch_memory=0.45GiB kv_cache_size=8.69GiB gpu_memory_utilization=0.90\n",
      "INFO 03-10 09:55:43 gpu_executor.py:113] # GPU blocks: 2965, # CPU blocks: 1365\n",
      "INFO 03-10 09:55:43 gpu_executor.py:117] Maximum concurrency for 32768 tokens per request: 1.45x\n",
      "INFO 03-10 09:55:46 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
      "INFO 03-10 09:55:46 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
      "INFO 03-10 09:56:08 model_runner.py:1518] Graph capturing finished in 22 secs, took 0.59 GiB\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "# import time\n",
    "os.environ[\"NCCL_P2P_DISABLE\"] = \"1\"\n",
    "import torch \n",
    "# import pynvml\n",
    "from vllm import LLM, SamplingParams\n",
    "import torch.distributed as dist\n",
    "import re\n",
    "torch.cuda.empty_cache()\n",
    "os.environ[\"TRANSFORMERS_OFFLINE\"] = \"1\"\n",
    "os.environ[\"HF_DATASETS_OFFLINE\"] = \"1\"\n",
    "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n",
    "\n",
    "\n",
    "\n",
    "# 加载量化模型\n",
    "llm = LLM(model=\"/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8\", dtype=\"half\",gpu_memory_utilization=0.9)\n",
    "\n",
    "# \n",
    "sampling_params = SamplingParams(temperature=0.1, top_p=0.3, max_tokens = 256)\n",
    "\n",
    "# 推理示例\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "2025-03-10 09:55:11,393\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 03-10 09:55:19 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.\n",
      "WARNING 03-10 09:55:19 config.py:428] gptq quantization is not fully optimized yet. The speed can be slower than non-quantized models.\n",
      "INFO 03-10 09:55:19 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', speculative_config=None, tokenizer='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, chat_template_text_format=string, mm_processor_kwargs=None, pooler_config=None)\n",
      "INFO 03-10 09:55:20 selector.py:261] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.\n",
      "INFO 03-10 09:55:20 selector.py:144] Using XFormers backend.\n",
      "INFO 03-10 09:55:20 model_runner.py:1072] Starting to load model /opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[W310 09:55:20.487166102 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())\n",
      "Loading safetensors checkpoint shards:   0% Completed | 0/5 [00:00<?, ?it/s]\n",
      "Loading safetensors checkpoint shards:  20% Completed | 1/5 [00:00<00:03,  1.24it/s]\n",
      "Loading safetensors checkpoint shards:  40% Completed | 2/5 [00:03<00:04,  1.64s/it]\n",
      "Loading safetensors checkpoint shards:  60% Completed | 3/5 [00:05<00:03,  1.92s/it]\n",
      "Loading safetensors checkpoint shards:  80% Completed | 4/5 [00:08<00:02,  2.25s/it]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:10<00:00,  2.26s/it]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:10<00:00,  2.06s/it]\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 03-10 09:55:32 model_runner.py:1077] Loading model weights took 15.5028 GB\n",
      "INFO 03-10 09:55:42 worker.py:232] Memory profiling results: total_gpu_memory=31.73GiB initial_memory_usage=15.89GiB peak_torch_memory=19.42GiB memory_usage_post_profile=15.96GiB non_torch_memory=0.45GiB kv_cache_size=8.69GiB gpu_memory_utilization=0.90\n",
      "INFO 03-10 09:55:43 gpu_executor.py:113] # GPU blocks: 2965, # CPU blocks: 1365\n",
      "INFO 03-10 09:55:43 gpu_executor.py:117] Maximum concurrency for 32768 tokens per request: 1.45x\n",
      "INFO 03-10 09:55:46 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
      "INFO 03-10 09:55:46 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
      "INFO 03-10 09:56:08 model_runner.py:1518] Graph capturing finished in 22 secs, took 0.59 GiB\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "# import time\n",
    "os.environ[\"NCCL_P2P_DISABLE\"] = \"1\"\n",
    "import torch \n",
    "# import pynvml\n",
    "from vllm import LLM, SamplingParams\n",
    "import torch.distributed as dist\n",
    "import re\n",
    "torch.cuda.empty_cache()\n",
    "os.environ[\"TRANSFORMERS_OFFLINE\"] = \"1\"\n",
    "os.environ[\"HF_DATASETS_OFFLINE\"] = \"1\"\n",
    "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n",
    "\n",
    "\n",
    "\n",
    "# 加载量化模型\n",
    "llm = LLM(model=\"/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8\", dtype=\"half\",gpu_memory_utilization=0.9)\n",
    "\n",
    "# \n",
    "sampling_params = SamplingParams(temperature=0.1, top_p=0.3, max_tokens = 256)\n",
    "\n",
    "# 推理示例\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "测试文件编码: GB2312\n",
      "规范文件编码: utf-8\n",
      "文件成功读取！\n",
      "初始化TF-IDF向量化器...\n",
      "开始匹配注释...\n",
      "处理第 0/127 条记录...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "处理第 100/127 条记录...\n",
      "\n",
      "匹配完成，共处理 127 条记录\n",
      "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv\n",
      "\n",
      "前3行匹配结果示例:\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
      "最佳匹配: 患者姓名 (分数: 0.4840)\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
      "最佳匹配: 患者性别 (分数: 0.5414)\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
      "最佳匹配: 患者年龄 (分数: 0.5357)\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import chardet\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import jieba\n",
    "\n",
    "# 首先检测文件的实际编码\n",
    "def detect_encoding(file_path):\n",
    "    with open(file_path, 'rb') as f:\n",
    "        result = chardet.detect(f.read())\n",
    "    return result['encoding']\n",
    "\n",
    "# 检测文件编码\n",
    "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
    "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
    "\n",
    "test_encoding = detect_encoding(test_file)\n",
    "regular_encoding = detect_encoding(regular_file)\n",
    "\n",
    "print(f\"测试文件编码: {test_encoding}\")\n",
    "print(f\"规范文件编码: {regular_encoding}\")\n",
    "\n",
    "# 尝试使用检测到的编码读取文件\n",
    "try:\n",
    "    # 读取规范文件\n",
    "    regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
    "    \n",
    "    # 读取测试数据\n",
    "    test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
    "    \n",
    "    print(\"文件成功读取！\")\n",
    "except Exception as e:\n",
    "    print(f\"使用检测到的编码读取失败: {e}\")\n",
    "    \n",
    "    # 尝试其他常见编码\n",
    "    encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
    "    \n",
    "    for enc in encodings:\n",
    "        try:\n",
    "            print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
    "            test_df = pd.read_csv(test_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取测试文件\")\n",
    "            \n",
    "            print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
    "            regular_df = pd.read_csv(regular_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取规范文件\")\n",
    "            \n",
    "            test_encoding = enc\n",
    "            regular_encoding = enc\n",
    "            break\n",
    "        except Exception as e:\n",
    "            print(f\"使用 {enc} 读取失败: {e}\")\n",
    "\n",
    "# 如果成功读取文件，继续处理\n",
    "if 'test_df' in locals() and 'regular_df' in locals():\n",
    "    # 创建规范字典，键为注释，值为对应的规则\n",
    "    regular_annotations = regular_df['注释'].tolist()\n",
    "    \n",
    "    # 准备测试数据中的字段组合\n",
    "    combined_fields = []\n",
    "    \n",
    "    for _, row in test_df.iterrows():\n",
    "        combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
    "        combined_fields.append(combined_field)\n",
    "    \n",
    "    # 使用TF-IDF向量化文本\n",
    "    print(\"初始化TF-IDF向量化器...\")\n",
    "    \n",
    "    # 对中文文本进行分词处理\n",
    "    def tokenize_chinese(text):\n",
    "        return list(jieba.cut(text))\n",
    "    \n",
    "    # 初始化TF-IDF向量化器\n",
    "    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n",
    "    \n",
    "    # 定义TF-IDF相似度计算方法，只返回最佳匹配（分数最高的）\n",
    "    def calculate_best_tfidf_match(query, candidates, vectorizer):\n",
    "        \"\"\"计算TF-IDF相似度，只返回最佳匹配（分数最高的）\"\"\"\n",
    "        try:\n",
    "            # 将所有文本合并为一个列表进行向量化\n",
    "            all_texts = [query] + candidates\n",
    "            \n",
    "            # 拟合并转换所有文本\n",
    "            tfidf_matrix = vectorizer.fit_transform(all_texts)\n",
    "            \n",
    "            # 计算查询与所有候选项的余弦相似度\n",
    "            query_vector = tfidf_matrix[0:1]\n",
    "            candidate_vectors = tfidf_matrix[1:]\n",
    "            \n",
    "            cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n",
    "            \n",
    "            # 找到分数最高的匹配\n",
    "            best_index = np.argmax(cosine_scores)\n",
    "            best_score = cosine_scores[best_index]\n",
    "            \n",
    "            return best_index, best_score\n",
    "        except Exception as e:\n",
    "            print(f\"TF-IDF相似度计算失败: {e}\")\n",
    "            return -1, 0.0\n",
    "    \n",
    "    # 创建结果DataFrame\n",
    "    result_data = []\n",
    "    \n",
    "    print(\"开始匹配注释...\")\n",
    "    \n",
    "    for i in range(len(test_df)):\n",
    "        if i % 100 == 0:\n",
    "            print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n",
    "        \n",
    "        query = combined_fields[i]\n",
    "        \n",
    "        # 使用TF-IDF相似度匹配，只获取最佳匹配\n",
    "        best_index, best_score = calculate_best_tfidf_match(query, regular_annotations, tfidf_vectorizer)\n",
    "        \n",
    "        # 获取测试数据的相关字段\n",
    "        row = test_df.iloc[i]\n",
    "        \n",
    "        # 创建结果字典，包含原始字段\n",
    "        result_dict = {\n",
    "            'ParagraphName': row['ParagraphName'],\n",
    "            'StatementName': row['StatementName'],\n",
    "            'ValueItemName': row['ValueItemName'],\n",
    "            'DisplayString': row['DisplayString']\n",
    "        }\n",
    "        \n",
    "        # 添加id字段（如果存在）\n",
    "        if 'Id' in test_df.columns:\n",
    "            result_dict['Id'] = row['Id']\n",
    "        \n",
    "        # 添加SFZH, XGRQ, IPBLH字段（如果存在）\n",
    "        if 'SFZH' in test_df.columns:\n",
    "            result_dict['SFZH'] = row['SFZH']\n",
    "        if 'XGRQ' in test_df.columns:\n",
    "            result_dict['XGRQ'] = row['XGRQ']\n",
    "        if 'IPBLH' in test_df.columns:\n",
    "            result_dict['IPBLH'] = row['IPBLH']\n",
    "        \n",
    "        # 添加最佳TF-IDF匹配结果\n",
    "        if best_index >= 0:\n",
    "            result_dict['TFIDF_规范节点名'] = regular_df.iloc[best_index]['节点名']\n",
    "            result_dict['TFIDF_规范注释'] = regular_df.iloc[best_index]['注释']\n",
    "            result_dict['TFIDF_规范说明'] = regular_df.iloc[best_index]['说明']\n",
    "            result_dict['TFIDF_相似度分数'] = best_score\n",
    "        else:\n",
    "            result_dict['TFIDF_规范节点名'] = ''\n",
    "            result_dict['TFIDF_规范注释'] = ''\n",
    "            result_dict['TFIDF_规范说明'] = ''\n",
    "            result_dict['TFIDF_相似度分数'] = 0.0\n",
    "        \n",
    "        # 添加到结果数据\n",
    "        result_data.append(result_dict)\n",
    "    \n",
    "    # 创建结果DataFrame\n",
    "    result_df = pd.DataFrame(result_data)\n",
    "    \n",
    "    # 保存结果\n",
    "    result_df.to_csv('/home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv', \n",
    "                     index=False, encoding=test_encoding)\n",
    "    \n",
    "    print(f\"\\n匹配完成，共处理 {len(test_df)} 条记录\")\n",
    "    print(f\"结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_tfidf_best_match.csv\")\n",
    "    \n",
    "    # 打印前3行匹配结果示例\n",
    "    print(\"\\n前3行匹配结果示例:\")\n",
    "    for i in range(min(3, len(result_df))):\n",
    "        print(f\"原始字段: {combined_fields[i]}\")\n",
    "        print(f\"最佳匹配: {result_df.iloc[i]['TFIDF_规范注释']} (分数: {result_df.iloc[i]['TFIDF_相似度分数']:.4f})\")\n",
    "        print(\"-\" * 50)\n",
    "else:\n",
    "    print(\"无法读取文件，请手动检查文件编码\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "测试文件编码: GB2312\n",
      "规范文件编码: utf-8\n",
      "文件成功读取！\n",
      "\n",
      "测试文件的列名:\n",
      "['Id', 'PatientName', 'IPBLH', 'OPBLH', 'KH', 'KLX', 'SexId', 'CSRQ', 'ZJLX', 'SFZH', 'HYZK', 'ZYDM', 'GJDM', 'MZDM', 'JZDZ', 'YB', 'JG', 'HKDZ', 'DHHM', 'SJHM', 'ABOBloodTypeId', 'LXRXM', 'LXRGX', 'LXRDH', 'ZLLB', 'ZLMC', 'XGRQ', 'YJLXH', 'RYSJ', 'FolderName', 'Xh', 'RecordXh', 'FolderId', 'DocumentName', 'InstanceId', 'DocumentId', 'ParagraphId', 'ParagraphName', 'StatementId', 'StatementName', 'ValueId', 'ValueItemName', 'ValueItemKind', 'RealValue', 'ValueString', 'DisplayString', 'ValuePostfix', 'WSJLSCSJ', 'WSJLXGSJ', 'upload_time']\n",
      "\n",
      "初始化TF-IDF向量化器...\n",
      "开始匹配注释...\n",
      "处理第 0/127 条记录...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "处理第 100/127 条记录...\n",
      "\n",
      "匹配完成，共处理 127 条记录\n",
      "结果已保存至: /home/limeng/SICT/lung_test/result/喉癌患者测试样例_完整字段_tfidf_match.csv\n",
      "\n",
      "结果文件的列名:\n",
      "['Id', 'PatientName', 'IPBLH', 'OPBLH', 'KH', 'KLX', 'SexId', 'CSRQ', 'ZJLX', 'SFZH', 'HYZK', 'ZYDM', 'GJDM', 'MZDM', 'JZDZ', 'YB', 'JG', 'HKDZ', 'DHHM', 'SJHM', 'ABOBloodTypeId', 'LXRXM', 'LXRGX', 'LXRDH', 'ZLLB', 'ZLMC', 'XGRQ', 'YJLXH', 'RYSJ', 'FolderName', 'Xh', 'RecordXh', 'FolderId', 'DocumentName', 'InstanceId', 'DocumentId', 'ParagraphId', 'ParagraphName', 'StatementId', 'StatementName', 'ValueId', 'ValueItemName', 'ValueItemKind', 'RealValue', 'ValueString', 'DisplayString', 'ValuePostfix', 'WSJLSCSJ', 'WSJLXGSJ', 'upload_time', '规范节点名', '规范注释', '规范说明', 'processed_string']\n",
      "\n",
      "前3行匹配结果示例:\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-病人姓名\n",
      "最佳匹配: 患者姓名\n",
      "处理后字符串: 测试\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-性别\n",
      "最佳匹配: 患者性别\n",
      "处理后字符串: 女\n",
      "--------------------------------------------------\n",
      "原始字段: 病例特点-患者姓名、性别、年龄-年龄\n",
      "最佳匹配: 患者年龄\n",
      "处理后字符串: 22岁\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import chardet\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import jieba\n",
    "\n",
    "# 首先检测文件的实际编码\n",
    "def detect_encoding(file_path):\n",
    "    with open(file_path, 'rb') as f:\n",
    "        result = chardet.detect(f.read())\n",
    "    return result['encoding']\n",
    "\n",
    "# 检测文件编码\n",
    "test_file = '/home/limeng/SICT/lung_test/data/喉癌患者测试样例.csv'\n",
    "regular_file = '/home/limeng/SICT/lung_test/data/regular.csv'\n",
    "\n",
    "test_encoding = detect_encoding(test_file)\n",
    "regular_encoding = detect_encoding(regular_file)\n",
    "\n",
    "print(f\"测试文件编码: {test_encoding}\")\n",
    "print(f\"规范文件编码: {regular_encoding}\")\n",
    "\n",
    "# 尝试使用检测到的编码读取文件\n",
    "try:\n",
    "    # 读取规范文件\n",
    "    regular_df = pd.read_csv(regular_file, encoding=regular_encoding)\n",
    "    \n",
    "    # 读取测试数据\n",
    "    test_df = pd.read_csv(test_file, encoding=test_encoding)\n",
    "    \n",
    "    print(\"文件成功读取！\")\n",
    "except Exception as e:\n",
    "    print(f\"使用检测到的编码读取失败: {e}\")\n",
    "    \n",
    "    # 尝试其他常见编码\n",
    "    encodings = ['gbk', 'gb18030', 'latin1', 'cp936', 'big5']\n",
    "    \n",
    "    for enc in encodings:\n",
    "        try:\n",
    "            print(f\"尝试使用 {enc} 编码读取测试文件...\")\n",
    "            test_df = pd.read_csv(test_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取测试文件\")\n",
    "            \n",
    "            print(f\"尝试使用 {enc} 编码读取规范文件...\")\n",
    "            regular_df = pd.read_csv(regular_file, encoding=enc)\n",
    "            print(f\"成功使用 {enc} 读取规范文件\")\n",
    "            \n",
    "            test_encoding = enc\n",
    "            regular_encoding = enc\n",
    "            break\n",
    "        except Exception as e:\n",
    "            print(f\"使用 {enc} 读取失败: {e}\")\n",
    "\n",
    "# 如果成功读取文件，继续处理\n",
    "if 'test_df' in locals() and 'regular_df' in locals():\n",
    "    # 打印测试文件的列名，以供参考\n",
    "    print(\"\\n测试文件的列名:\")\n",
    "    print(test_df.columns.tolist())\n",
    "    \n",
    "    # 创建规范字典，键为注释，值为对应的规则\n",
    "    regular_annotations = regular_df['注释'].tolist()\n",
    "    \n",
    "    # 准备测试数据中的字段组合\n",
    "    combined_fields = []\n",
    "    \n",
    "    for _, row in test_df.iterrows():\n",
    "        combined_field = f\"{row['ParagraphName']}-{row['StatementName']}-{row['ValueItemName']}\"\n",
    "        combined_fields.append(combined_field)\n",
    "    \n",
    "    # 使用TF-IDF向量化文本\n",
    "    print(\"\\n初始化TF-IDF向量化器...\")\n",
    "    \n",
    "    # 对中文文本进行分词处理\n",
    "    def tokenize_chinese(text):\n",
    "        return list(jieba.cut(text))\n",
    "    \n",
    "    # 初始化TF-IDF向量化器\n",
    "    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_chinese, analyzer='word')\n",
    "    \n",
    "    # 定义TF-IDF相似度计算方法，只返回最佳匹配（分数最高的）\n",
    "    def calculate_best_tfidf_match(query, candidates, vectorizer):\n",
    "        \"\"\"计算TF-IDF相似度，只返回最佳匹配（分数最高的）\"\"\"\n",
    "        try:\n",
    "            # 将所有文本合并为一个列表进行向量化\n",
    "            all_texts = [query] + candidates\n",
    "            \n",
    "            # 拟合并转换所有文本\n",
    "            tfidf_matrix = vectorizer.fit_transform(all_texts)\n",
    "            \n",
    "            # 计算查询与所有候选项的余弦相似度\n",
    "            query_vector = tfidf_matrix[0:1]\n",
    "            candidate_vectors = tfidf_matrix[1:]\n",
    "            \n",
    "            cosine_scores = cosine_similarity(query_vector, candidate_vectors)[0]\n",
    "            \n",
    "            # 找到分数最高的匹配\n",
    "            best_index = np.argmax(cosine_scores)\n",
    "            best_score = cosine_scores[best_index]\n",
    "            \n",
    "            return best_index, best_score\n",
    "        except Exception as e:\n",
    "            print(f\"TF-IDF相似度计算失败: {e}\")\n",
    "            return -1, 0.0\n",
    "    \n",
    "    # 创建结果DataFrame\n",
    "    result_data = []\n",
    "    \n",
    "    print(\"开始匹配注释...\")\n",
    "    \n",
    "    for i in range(len(test_df)):\n",
    "        if i % 100 == 0:\n",
    "            print(f\"处理第 {i}/{len(test_df)} 条记录...\")\n",
    "        \n",
    "        query = combined_fields[i]\n",
    "        \n",
    "        # 使用TF-IDF相似度匹配，只获取最佳匹配\n",
    "        best_index, best_score = calculate_best_tfidf_match(query, regular_annotations, tfidf_vectorizer)\n",
    "        \n",
    "        # 获取测试数据的行\n",
    "        row = test_df.iloc[i]\n",
    "        \n",
    "        # 创建结果字典，包含测试数据的所有字段\n",
    "        result_dict = row.to_dict()\n",
    "        \n",
    "        # 添加最佳TF-IDF匹配结果\n",
    "        if best_index >= 0:\n",
    "            result_dict['规范节点名'] = regular_df.iloc[best_index]['节点名']\n",
    "            result_dict['规范注释'] = regular_df.iloc[best_index]['注释']\n",
    "            result_dict['规范说明'] = regular_df.iloc[best_index]['说明']\n",
    "            \n",
    "            \n",
    "            # 从最佳匹配中提取processed_string值\n",
    "            # 如果需要对DisplayString进行处理，可以在此添加逻辑\n",
    "            result_dict['processed_string'] = row['DisplayString']\n",
    "        else:\n",
    "            result_dict['规范节点名'] = ''\n",
    "            result_dict['规范注释'] = ''\n",
    "            result_dict['规范说明'] = ''\n",
    "            \n",
    "            result_dict['processed_string'] = ''\n",
    "        \n",
    "        # 添加到结果数据\n",
    "        result_data.append(result_dict)\n",
    "    \n",
    "    # 创建结果DataFrame\n",
    "    result_df = pd.DataFrame(result_data)\n",
    "    \n",
    "    # 重新排列列顺序，将匹配结果放在后面\n",
    "    all_columns = test_df.columns.tolist() + ['规范节点名', '规范注释', '规范说明',  'processed_string']\n",
    "    result_df = result_df[all_columns]\n",
    "    \n",
    "    # 保存结果\n",
    "    result_file = '/home/limeng/SICT/lung_test/result/喉癌患者测试样例_完整字段_tfidf_match.csv'\n",
    "    result_df.to_csv(result_file, index=False, encoding=test_encoding)\n",
    "    \n",
    "    print(f\"\\n匹配完成，共处理 {len(test_df)} 条记录\")\n",
    "    print(f\"结果已保存至: {result_file}\")\n",
    "    \n",
    "    # 打印结果DataFrame的列名\n",
    "    print(\"\\n结果文件的列名:\")\n",
    "    print(result_df.columns.tolist())\n",
    "    \n",
    "    # 打印前3行匹配结果示例\n",
    "    print(\"\\n前3行匹配结果示例:\")\n",
    "    for i in range(min(3, len(result_df))):\n",
    "        print(f\"原始字段: {combined_fields[i]}\")\n",
    "        print(f\"最佳匹配: {result_df.iloc[i]['规范注释']}\")\n",
    "        print(f\"处理后字符串: {result_df.iloc[i]['processed_string']}\")\n",
    "        print(\"-\" * 50)\n",
    "else:\n",
    "    print(\"无法读取文件，请手动检查文件编码\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/limeng/anaconda3/envs/Qwen2.5/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "2025-03-10 09:55:11,393\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 03-10 09:55:19 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.\n",
      "WARNING 03-10 09:55:19 config.py:428] gptq quantization is not fully optimized yet. The speed can be slower than non-quantized models.\n",
      "INFO 03-10 09:55:19 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', speculative_config=None, tokenizer='/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, chat_template_text_format=string, mm_processor_kwargs=None, pooler_config=None)\n",
      "INFO 03-10 09:55:20 selector.py:261] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.\n",
      "INFO 03-10 09:55:20 selector.py:144] Using XFormers backend.\n",
      "INFO 03-10 09:55:20 model_runner.py:1072] Starting to load model /opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[W310 09:55:20.487166102 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())\n",
      "Loading safetensors checkpoint shards:   0% Completed | 0/5 [00:00<?, ?it/s]\n",
      "Loading safetensors checkpoint shards:  20% Completed | 1/5 [00:00<00:03,  1.24it/s]\n",
      "Loading safetensors checkpoint shards:  40% Completed | 2/5 [00:03<00:04,  1.64s/it]\n",
      "Loading safetensors checkpoint shards:  60% Completed | 3/5 [00:05<00:03,  1.92s/it]\n",
      "Loading safetensors checkpoint shards:  80% Completed | 4/5 [00:08<00:02,  2.25s/it]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:10<00:00,  2.26s/it]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:10<00:00,  2.06s/it]\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 03-10 09:55:32 model_runner.py:1077] Loading model weights took 15.5028 GB\n",
      "INFO 03-10 09:55:42 worker.py:232] Memory profiling results: total_gpu_memory=31.73GiB initial_memory_usage=15.89GiB peak_torch_memory=19.42GiB memory_usage_post_profile=15.96GiB non_torch_memory=0.45GiB kv_cache_size=8.69GiB gpu_memory_utilization=0.90\n",
      "INFO 03-10 09:55:43 gpu_executor.py:113] # GPU blocks: 2965, # CPU blocks: 1365\n",
      "INFO 03-10 09:55:43 gpu_executor.py:117] Maximum concurrency for 32768 tokens per request: 1.45x\n",
      "INFO 03-10 09:55:46 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
      "INFO 03-10 09:55:46 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
      "INFO 03-10 09:56:08 model_runner.py:1518] Graph capturing finished in 22 secs, took 0.59 GiB\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "# import time\n",
    "os.environ[\"NCCL_P2P_DISABLE\"] = \"1\"\n",
    "import torch \n",
    "# import pynvml\n",
    "from vllm import LLM, SamplingParams\n",
    "import torch.distributed as dist\n",
    "import re\n",
    "torch.cuda.empty_cache()\n",
    "os.environ[\"TRANSFORMERS_OFFLINE\"] = \"1\"\n",
    "os.environ[\"HF_DATASETS_OFFLINE\"] = \"1\"\n",
    "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n",
    "\n",
    "\n",
    "\n",
    "# 加载量化模型\n",
    "llm = LLM(model=\"/opt/lung/llm/Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8\", dtype=\"half\",gpu_memory_utilization=0.9)\n",
    "\n",
    "# \n",
    "sampling_params = SamplingParams(temperature=0.1, top_p=0.3, max_tokens = 256)\n",
    "\n",
    "# 推理示例\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Qwen2.5",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}