# Stopwords - stopword list
#   1. appear in large numbers throughout the corpus
#   2. carry little useful information
# tf-idf - keyword extraction
#   tf (term frequency) * idf (inverse document frequency)
# Similarity: tokenize -> corpus -> term frequencies -> frequency vectors
# word2vec (gensim), cosine similarity
import pandas as pd
import jieba

df_news = pd.read_table('../datas/val.txt',
                        names=['category', 'theme', 'URL', 'content'],
                        encoding='utf-8')
df_news = df_news.dropna()
df_news.head()
print(df_news)

# Dataset size
df_news.shape

# Tokenization: use the jieba segmenter
content = df_news.content.values.tolist()
print(content[1000])

content_S = []
for line in content:
    current_segment = jieba.lcut(line)
    # skip lines that are only a newline
    if len(current_segment) > 1 and current_segment != ['\r\n']:
        content_S.append(current_segment)

# New format: one tokenized document per row
df_content = pd.DataFrame({'content_S': content_S})
df_content.head()

# Stopword list
stopwords = pd.read_csv('stopwords.txt', index_col=False, sep='\t',
                        quoting=3, names=['stopword'], encoding='utf-8')
stopwords.head()


def drop_stopwords(contents, stopwords):
    """Remove stopwords from every document; also collect all remaining words."""
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            line_clean.append(word)
            all_words.append(str(word))
        contents_clean.append(line_clean)
    return contents_clean, all_words


contents = df_content.content_S.values.tolist()
stopwords = stopwords.stopword.values.tolist()
contents_clean, all_words = drop_stopwords(contents, stopwords)

# Documents after stopword removal
df_content = pd.DataFrame({'contents_clean': contents_clean})
df_content.head()

# Word statistics after stopword removal
df_all_words = pd.DataFrame({'all_words': all_words})
df_all_words.head()

# Word counts over the corpus
words_count = df_all_words.groupby('all_words').size().reset_index(name='count')
words_count = words_count.sort_values(by='count', ascending=False)
words_count.head()

# Word cloud (wordCloud)

# Keyword extraction with tf-idf
import jieba.analyse

index = 1000
print(df_news['content'][index])
content_S_str = ''.join(content_S[index])
print(' '.join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))

# LDA topic model
from gensim import corpora, models, similarities
import gensim

# Build the word-to-id mapping (a bag of words), e.g. 蔡徐坤 -> 1, 练习生 -> 2, 你太美 -> 3
dictionary = corpora.Dictionary(contents_clean)
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]
# num_topics behaves like the number of clusters in k-means
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)

# Result for topic no. 1
print(lda.print_topic(1, topn=5))

# Print the top five words of every topic
for topic in lda.print_topics(num_topics=20, num_words=5):
    print(topic[1])
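
# The header above mentions word2vec (gensim) and cosine similarity, but the
# script stops at LDA. Below is a minimal sketch of that step, assuming the
# tokenized corpus contents_clean from above and gensim >= 4; the
# vector_size/window/min_count values are illustrative, not from the original.
from gensim.models import Word2Vec
import numpy as np

# Train a small Word2Vec model on the cleaned, tokenized documents
w2v = Word2Vec(sentences=contents_clean, vector_size=100, window=5,
               min_count=5, workers=4)

# Cosine similarity between two in-vocabulary words;
# gensim's similarity() is exactly the cosine of the two word vectors
word_a, word_b = w2v.wv.index_to_key[0], w2v.wv.index_to_key[1]  # two frequent words, picked only for illustration
print(w2v.wv.similarity(word_a, word_b))

# The same value computed by hand
va, vb = w2v.wv[word_a], w2v.wv[word_b]
print(np.dot(va, vb) / (np.linalg.norm(va) * np.linalg.norm(vb)))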