import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

import pyLDAvis
import pyLDAvis.sklearn

# Read the Excel data
df = pd.read_excel('../datas/hebing.xlsx')
text_column = df['合并咨询文本']  # the merged consultation-text column

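# Tokenization: CountVectorizer splits tokens on whitespace and punctuation,
# which does not segment raw Chinese text into words. A minimal sketch using
# jieba (an assumption about the data; skip this step if the texts in
# hebing.xlsx are already space-separated):
import jieba

text_column = text_column.astype(str).apply(lambda doc: " ".join(jieba.cut(doc)))
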
# Text vectorization
# Create a CountVectorizer object to convert the texts into a word-count matrix
n_features = 1000  # keep the 1000 most frequent terms as features
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                max_features=n_features,
                                stop_words='english',  # has little effect on Chinese tokens; supply a custom stop-word list for Chinese corpora
                                max_df=0.5,
                                min_df=2)
tf = tf_vectorizer.fit_transform(text_column)
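
# Optional sanity check: the document-term matrix should have one row per
# document and one column per extracted term, and the vocabulary should
# contain segmented words rather than whole sentences.
print(tf.shape)  # (n_documents, n_features)
print(tf_vectorizer.get_feature_names_out()[:20])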

# Create a LatentDirichletAllocation object to model the topics
n_topics = 10  # number of LDA topics; the content labels are known to fall into 8 categories, so tune this accordingly
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                learning_method='batch',
                                learning_offset=50,
                                doc_topic_prior=0.1,
                                topic_word_prior=0.01,
                                random_state=666)  # see the scikit-learn documentation for parameter details
lda.fit(tf)
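
# After fitting, lda.components_ holds the topic-word weights with shape
# (n_topics, n_features); a quick shape check:
print(lda.components_.shape)  # e.g. (10, 1000) for 10 topics and 1000 features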


# Helper to print the top words of each topic
def print_top_words(model, feature_names, n_top_words):
    tword = []
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic #{topic_idx}:")
        topic_w = " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        tword.append(topic_w)
        print(topic_w)
    return tword


# Print the top 25 words of each topic
n_top_words = 25
tf_feature_names = tf_vectorizer.get_feature_names_out()
topic_word = print_top_words(lda, tf_feature_names, n_top_words)

# Use the fitted model to infer each document's topic distribution
topics = lda.transform(tf)
print(topics[0])  # topic probabilities of the first document
# Take the highest-probability topic as each document's label
topic = topics.argmax(axis=1)
df['topic'] = topic  # append the dominant-topic index to the original DataFrame
df.to_excel("../datas/LDA.xlsx", index=False)  # save the results as an Excel file

# Visualize the model
# pyLDAvis.enable_notebook()
# pic = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
# pyLDAvis.save_html(pic, 'lda_pass' + str(n_topics) + '.html')  # bundle the visualization into an HTML file
# pyLDAvis.show(pic, local=False)
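# Note: pyLDAvis 3.4+ renamed the sklearn bridge; if `import pyLDAvis.sklearn`
# fails, this is the equivalent (an assumption about the installed version):
# import pyLDAvis.lda_model
# pic = pyLDAvis.lda_model.prepare(lda, tf, tf_vectorizer)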

# When the number of topics is unknown, sweep over candidate counts and use perplexity to pick one visually
import matplotlib.pyplot as plt

plexs = []
scores = []
n_max_topics = 16  # upper bound of the sweep; adjust as needed
for i in range(1, n_max_topics):
    lda = LatentDirichletAllocation(n_components=i, max_iter=50,
                                    learning_method='batch',
                                    learning_offset=50, random_state=666)
    lda.fit(tf)
    plexs.append(lda.perplexity(tf))  # lower perplexity is better
    scores.append(lda.score(tf))  # approximate log-likelihood; higher is better

n_t = 15  # right edge of the plotted range; must not exceed n_max_topics
x = list(range(1, n_t))
plt.plot(x, plexs[:n_t - 1])  # plexs[i] is the perplexity for i + 1 topics
plt.xlabel("number of topics")
plt.ylabel("perplexity")
plt.show()
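
# A companion sketch: the log-likelihood scores collected in the loop above can
# be plotted the same way; higher values are better.
plt.plot(x, scores[:n_t - 1])
plt.xlabel("number of topics")
plt.ylabel("log-likelihood")
plt.show()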