import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import pyLDAvis
import pyLDAvis.sklearn

# Read the Excel data
df = pd.read_excel('../datas/hebing.xlsx')
text_column = df['合并咨询文本']

# Word segmentation (see the hedged jieba sketch appended at the end of this file)

# Text vectorization: CountVectorizer converts the documents into a term-count matrix
n_features = 1000  # keep the 1000 most frequent terms as features
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                max_features=n_features,
                                stop_words='english',
                                max_df=0.5,
                                min_df=2)
tf = tf_vectorizer.fit_transform(text_column)

# Fit the LDA topic model
n_topics = 10  # number of topics to fit; the labels in this data have 8 categories, so tune as needed
lda = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=50,
                                learning_method='batch',
                                learning_offset=50,
                                doc_topic_prior=0.1,
                                topic_word_prior=0.01,
                                random_state=666)  # see the scikit-learn docs for parameter details
lda.fit(tf)

# Print the top words of each topic
def print_top_words(model, feature_names, n_top_words):
    tword = []
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic #{topic_idx}:")
        topic_w = " ".join([feature_names[i]
                            for i in topic.argsort()[:-n_top_words - 1:-1]])
        tword.append(topic_w)
        print(topic_w)
    return tword

n_top_words = 25
tf_feature_names = tf_vectorizer.get_feature_names_out()
topic_word = print_top_words(lda, tf_feature_names, n_top_words)

# Use the fitted model to get each document's topic distribution
topics = lda.transform(tf)
print(topics[0])  # topic probabilities of the first document

# Assign each document to its most probable topic
topic = [int(np.argmax(t)) for t in topics]
df['topic'] = topic
df.to_excel("../datas/LDA.xlsx", index=False)  # save the results to an Excel file

# Visualize the model
# pyLDAvis.enable_notebook()
# pic = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
# pyLDAvis.save_html(pic, 'lda_pass' + str(n_topics) + '.html')  # export the visualization as an HTML file
# pyLDAvis.show(pic, local=False)

# When the number of topics is unknown, use perplexity over a range of
# candidate values to pick it
import matplotlib.pyplot as plt

plexs = []
scores = []
n_max_topics = 16  # upper bound on the number of topics to try; adjust as needed
for i in range(1, n_max_topics):
    lda = LatentDirichletAllocation(n_components=i,
                                    max_iter=50,
                                    learning_method='batch',
                                    learning_offset=50,
                                    random_state=666)
    lda.fit(tf)
    plexs.append(lda.perplexity(tf))
    scores.append(lda.score(tf))

n_t = 15  # right edge of the plotted range; must not exceed n_max_topics
x = list(range(1, n_t))
plt.plot(x, plexs[0:n_t - 1])  # plexs[0] is the perplexity for 1 topic, so the x labels line up
plt.xlabel("number of topics")
plt.ylabel("perplexity")
plt.show()
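
# A hedged addition (not in the original script): the loop above also
# collects lda.score(tf) in `scores` -- the approximate log-likelihood,
# where higher is better -- but never uses it. A minimal sketch plotting
# it on the same x-axis as the perplexity curve:
plt.plot(x, scores[0:n_t - 1])
plt.xlabel("number of topics")
plt.ylabel("approximate log-likelihood")
plt.show()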
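
# --- Appendix: Chinese word segmentation sketch (an assumption, not part
# of the original script) ---
# The "word segmentation" comment near the top has no implementation, and
# CountVectorizer's default token pattern will not split contiguous Chinese
# text into words. A minimal sketch assuming the third-party jieba library
# (pip install jieba); it would be applied to the text column before
# vectorizing:
#
# import jieba
#
# def segment_chinese(text):
#     # Join jieba's tokens with spaces so CountVectorizer can split them
#     return " ".join(jieba.lcut(str(text)))
#
# df['合并咨询文本'] = df['合并咨询文本'].apply(segment_chinese)
# tf = tf_vectorizer.fit_transform(df['合并咨询文本'])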