import pandas as pd
import thulac

# Load the THULAC model. Author's note: with input on the order of tens of
# thousands of rows, a full run of this script takes about 30 minutes.
# NOTE(review): the model directory is an absolute path into a local
# virtualenv -- confirm it exists on the machine running this script.
model_dir = 'E:/pythonTest/nlpTest/venv/Lib/site-packages/thulac/models'
thu = thulac.thulac(user_dict=None, model_path=model_dir)

# Read one sheet of the source workbook and select the column whose cells
# are processed by the rest of the script.
source_path = r'../../data-dev/医生回答分词0510-handler.xlsx'
df = pd.read_excel(source_path, sheet_name='回答分词')
word_lists = df['分词融合']

# Run part-of-speech tagging on every entry and record where each token
# sits in the original text. `result` maps the row number to a dict with:
#   word_list - the original cell value
#   seg_list  - the tagged text as formatted by THULAC (text=True)
#   pos_list  - the tag of every token, in order
#   pos_index - one [start, end) character span per token, in order
result = {}
for i, word_list in enumerate(word_lists):
    # Missing values (e.g. blank spreadsheet cells, which pandas represents
    # as NaN) are not strings and cannot be passed to the tagger; treat them
    # as empty text. Other non-string values are stringified.
    text = "" if pd.isna(word_list) else str(word_list)

    seg_list = thu.cut(text, text=True)
    # Token/tag pairs: computed once and reused for both the tag list and
    # the span lookup, instead of re-running the tagger for each use.
    tagged = thu.cut(text)
    pos_list = [pair[1] for pair in tagged]

    # Locate each token by scanning left to right, so that repeated tokens
    # map to successive occurrences rather than all to the first one.
    pos_index = []
    cursor = 0
    for word, _tag in tagged:
        start_idx = text.find(word, cursor)
        if start_idx == -1:
            # The token does not occur verbatim after the cursor. Record a
            # sentinel span and leave the cursor alone so the spans of the
            # following tokens are not corrupted by a negative offset.
            pos_index.append([-1, -1])
            continue
        end_idx = start_idx + len(word)
        pos_index.append([start_idx, end_idx])
        cursor = end_idx

    # "pos_index" is always present (an empty list when there are no
    # tokens), matching how "pos_list" behaves for such rows.
    result[i] = {
        "word_list": word_list,
        "seg_list": seg_list,
        "pos_list": pos_list,
        "pos_index": pos_index,
    }

# Turn the per-row result dicts into a table (one row per entry, one column
# per dict key) and write it to an Excel workbook without the index column.
output_path = "../../data-dev/医生分词结果词性表0511.xlsx"
output_df = pd.DataFrame.from_dict(data=result, orient="index")
output_df.to_excel(output_path, index=False)