import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def _top_feature_indices(matrix, row, limit):
    """Return column indices of the ``limit`` highest-weighted terms in a row.

    Only entries actually stored for the row in the CSR matrix are ranked,
    so a document with fewer than ``limit`` distinct terms yields a shorter
    list instead of being padded with zero-weight vocabulary entries that do
    not occur in it. Indices are ordered by ascending weight, i.e. the
    strongest term comes last.

    Args:
        matrix: CSR sparse matrix of TF-IDF weights (documents x vocabulary).
        row: Index of the document row to inspect.
        limit: Maximum number of term indices to return.

    Returns:
        A list of at most ``limit`` integer column indices.
    """
    if limit <= 0:
        # A plain ``[-0:]`` slice would select *every* term, not none.
        return []
    start, end = matrix.indptr[row], matrix.indptr[row + 1]
    columns = matrix.indices[start:end]
    weights = matrix.data[start:end]
    return columns[weights.argsort()[-limit:]].tolist()


# Read the Excel file
df = pd.read_excel('../datas/hebing.xlsx', sheet_name='Sheet1')

# Extract the text column as a list of documents. Blank cells are loaded as
# NaN (a float), which the vectorizer cannot tokenize, so replace them with
# empty strings and make sure every entry is a str.
text_data = df['合并咨询文本'].fillna('').astype(str).tolist()

# Use TfidfVectorizer to weight candidate keywords.
# NOTE(review): the default token pattern splits on word boundaries; if this
# column holds unsegmented Chinese text, whole phrases will become single
# tokens -- confirm the text is pre-segmented or supply a tokenizer.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(text_data)
feature_names = vectorizer.get_feature_names_out()

# Get the indices of the top three features for each document. Ranking is
# done row by row on the sparse matrix to avoid materialising a dense
# (documents x vocabulary) array.
top_n = 3
top_features_indices = [
    _top_feature_indices(tfidf, row, top_n) for row in range(tfidf.shape[0])
]

# Get the feature names for the top three features
top_features_names = [
    [feature_names[idx] for idx in indices] for indices in top_features_indices
]
print(top_features_names)