# Stopwords - stopword list
#   1. appear in large numbers throughout the corpus
#   2. carry little useful information
# tf-idf - keyword extraction
#   tf (term frequency) * idf (inverse document frequency)
# Similarity: tokenize -> corpus -> term frequencies -> frequency vectors
# word2vec (gensim), cosine similarity
import pandas as pd
import jieba

df_news = pd.read_table('../datas/val.txt',
                        names=['category', 'theme', 'URL', 'content'],
                        encoding='utf-8')
df_news = df_news.dropna()
df_news.head()
print(df_news)

# Dataset size
df_news.shape

# Tokenization: use the jieba segmenter
content = df_news.content.values.tolist()
print(content[1000])

content_S = []
for line in content:
    current_segment = jieba.lcut(line)
    # skip lines that are only a newline
    if len(current_segment) > 1 and current_segment != ['\r\n']:
        content_S.append(current_segment)

# New format: one tokenized document per row
df_content = pd.DataFrame({'content_S': content_S})
df_content.head()

# Stopword list
stopwords = pd.read_csv('stopwords.txt', index_col=False, sep='\t',
                        quoting=3, names=['stopword'], encoding='utf-8')
stopwords.head()


def drop_stopwords(contents, stopwords):
    """Remove stopwords from every document; also collect all remaining words."""
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            line_clean.append(word)
            all_words.append(str(word))
        contents_clean.append(line_clean)
    return contents_clean, all_words


contents = df_content.content_S.values.tolist()
stopwords = stopwords.stopword.values.tolist()
contents_clean, all_words = drop_stopwords(contents, stopwords)

# Documents after stopword removal
df_content = pd.DataFrame({'contents_clean': contents_clean})
df_content.head()

# Word statistics after stopword removal
df_all_words = pd.DataFrame({'all_words': all_words})
df_all_words.head()

# Word counts over the corpus
words_count = df_all_words.groupby('all_words').size().reset_index(name='count')
words_count = words_count.sort_values(by='count', ascending=False)
words_count.head()

# Word cloud (wordCloud)

# Keyword extraction with tf-idf
import jieba.analyse

index = 1000
print(df_news['content'][index])
content_S_str = ''.join(content_S[index])
print(' '.join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))

# LDA topic model
from gensim import corpora, models, similarities
import gensim

# Build the word-to-id mapping (a bag of words), e.g. 蔡徐坤 -> 1, 练习生 -> 2, 你太美 -> 3
dictionary = corpora.Dictionary(contents_clean)
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]
# num_topics behaves like the number of clusters in k-means
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)

# Result for topic no. 1
print(lda.print_topic(1, topn=5))

# Print the top five words of every topic
for topic in lda.print_topics(num_topics=20, num_words=5):
    print(topic[1])
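
# The header above mentions word2vec (gensim) and cosine similarity, but the
# script stops at LDA. Below is a minimal sketch of that step, assuming the
# tokenized corpus contents_clean from above and gensim >= 4; the
# vector_size/window/min_count values are illustrative, not from the original.
from gensim.models import Word2Vec
import numpy as np

# Train a small Word2Vec model on the cleaned, tokenized documents
w2v = Word2Vec(sentences=contents_clean, vector_size=100, window=5,
               min_count=5, workers=4)

# Cosine similarity between two in-vocabulary words;
# gensim's similarity() is exactly the cosine of the two word vectors
word_a, word_b = w2v.wv.index_to_key[0], w2v.wv.index_to_key[1]  # two frequent words, picked only for illustration
print(w2v.wv.similarity(word_a, word_b))

# The same value computed by hand
va, vb = w2v.wv[word_a], w2v.wv[word_b]
print(np.dot(va, vb) / (np.linalg.norm(va) * np.linalg.norm(vb)))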