"""POS-tag a column of doctor-answer texts with THULAC and export to Excel.

For every cell of the input column the script stores the original value, the
THULAC text-form output, the list of part-of-speech tags, and the
``[start, end]`` character span of every word inside the original text.
"""
import pandas as pd
import thulac

MODEL_PATH = 'E:/pythonTest/nlpTest/venv/Lib/site-packages/thulac/models'
INPUT_PATH = r'../../data-dev/医生回答分词0510-handler.xlsx'
INPUT_SHEET = '回答分词'
INPUT_COLUMN = '分词融合'
OUTPUT_PATH = "../../data-dev/医生分词结果词性表0511.xlsx"


def _word_spans(text, words):
    """Return the ``[start, end]`` span of each word, searching left to right.

    Every search starts where the previous match ended, so repeated words map
    to successive occurrences. A word that cannot be found is reported as
    ``[-1, -1]`` and does not move the search position; otherwise the -1
    returned by ``str.find`` would rewind the cursor and corrupt every
    following span.
    """
    spans = []
    cursor = 0
    for word in words:
        start = text.find(word, cursor)
        if start == -1:
            spans.append([-1, -1])
            continue
        cursor = start + len(word)
        spans.append([start, cursor])
    return spans


# Load the THULAC model. Original author's note: with row counts on the order
# of ten thousand, a full run takes about 30 minutes.
thu = thulac.thulac(user_dict=None, model_path=MODEL_PATH)

# Read the Excel file.
df = pd.read_excel(INPUT_PATH, sheet_name=INPUT_SHEET)
word_lists = df[INPUT_COLUMN]

# Run part-of-speech analysis on every entry of the column.
result = {}
for i, word_list in enumerate(word_lists):
    if pd.isna(word_list):
        # Blank Excel cells are read as NaN; emit an empty row instead of
        # crashing inside THULAC part-way through a long run.
        result[i] = {"word_list": word_list, "seg_list": "", "pos_list": []}
        continue

    # Non-string cells (e.g. purely numeric answers) are tagged via their
    # string form; the stored "word_list" keeps the original cell value.
    text = word_list if isinstance(word_list, str) else str(word_list)

    seg_list = thu.cut(text, text=True)
    # Tag once in list form and reuse the pairs for both the POS list and the
    # span lookup (previously the same cut was computed twice).
    pairs = thu.cut(text)
    pos_list = [pair[1] for pair in pairs]
    result[i] = {"word_list": word_list, "seg_list": seg_list, "pos_list": pos_list}

    # Locate every word in the original text. The key is only added when
    # there is at least one word, matching the previous output layout.
    spans = _word_spans(text, [pair[0] for pair in pairs])
    if spans:
        result[i]["pos_index"] = spans

output_df = pd.DataFrame.from_dict(result, orient="index")
output_df.to_excel(OUTPUT_PATH, index=False)