"""Segment Chinese consultation text and write the tokens back to Excel.

Reads the '合并咨询文本' column from Sheet2 of duihua.xlsx, converts each
entry to simplified Chinese, cuts it into words with jieba, drops stopwords,
and writes the space-joined tokens back out.
"""
import pandas as pd
import jieba.analyse
from hanziconv import HanziConv

df = pd.read_excel('../datas/duihua.xlsx', sheet_name='Sheet2')
text = df['合并咨询文本'].tolist()

# Stopword list: HIT stopwords (other candidates: Sichuan Machine
# Intelligence Lab list, Baidu list).
# TODO: add medical-domain stopwords to improve downstream model accuracy.
# `with` closes the file handle; a set gives O(1) membership tests instead
# of the original list's O(n) scan per token.
with open('../datas/hit_stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f}

result = []
for line in text:
    # str() guards against non-string cells (e.g. NaN from blank Excel rows),
    # which would otherwise crash HanziConv.toSimplified.
    seg_list = jieba.cut(HanziConv.toSimplified(str(line)))
    result.append(' '.join(tok for tok in seg_list if tok not in stopwords))

df_result = pd.DataFrame(result, columns=['合并咨询文本'])
# NOTE(review): this rewrites duihua.xlsx entirely — any sheet other than
# Sheet2 in the workbook is lost. Consider writing to a separate output file.
df_result.to_excel('../datas/duihua.xlsx', sheet_name='Sheet2', index=False)