"""Segment the 'ask' column of a ten-department medical Q&A dataset.

Pipeline: read the Excel dataset -> convert Traditional Chinese to
Simplified -> tokenize with jieba (precise mode) -> drop HIT stopwords ->
write the space-joined tokens plus the original titles to a result file.
"""
from hanziconv import HanziConv
import jieba
import pandas as pd

df = pd.read_excel('../Data/十种科室数据集.xlsx', sheet_name='Sheet1')
text = df['ask'].tolist()

# Load the HIT stopword list as a set: `with` guarantees the file handle is
# closed, and set membership makes the per-token `in` test O(1) instead of
# scanning a list for every token in the corpus.
with open('../Data/hit_stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f}

result = []
for line in text:
    # Precise mode; normalize Traditional -> Simplified before cutting so
    # the stopword match (which is in Simplified) works consistently.
    seg_list = jieba.cut(HanziConv.toSimplified(line))
    result.append(' '.join(tok for tok in seg_list if tok not in stopwords))

df_result = pd.DataFrame({'分词': result, 'title': df.title})
df_result.to_excel('../Result/科室分词结果.xlsx', sheet_name='Sheet1', index=False)