""" 看词频 不符合的词纳入停用词库 """ from sklearn.feature_extraction.text import TfidfVectorizer import pandas as pd df = pd.read_excel('../../data-dev/医生回答分词0510.xlsx', sheet_name='回答分词') # 获取分词后的文本列表 text_list = df['分词'].tolist() # 计算TF-IDF值 vectorizer = TfidfVectorizer() X = vectorizer.fit_transform(text_list) # 获取单词与列索引的对应关系 vocab = vectorizer.vocabulary_ # 获取每个单词的逆文档频率 idf = vectorizer.idf_ # 计算每个单词的TF-IDF加权值并存储在字典中 word_scores = {} for word, index in vocab.items(): tf_idf = X[:, index].mean() word_scores[word] = tf_idf * idf[index] # 按照TF-IDF加权值从大到小排序并选出前50个单词 top_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)[:100] # 输出结果 for i, (word, score) in enumerate(top_words): print(f"{i+1}. {word}: {score}") # print(word)