"""Small NLTK walkthrough: tokenize a sentence, explore it, inspect stopwords."""
import nltk
from nltk.tokenize import word_tokenize
from nltk.text import Text
from nltk.corpus import stopwords

# NLTK is a very practical text-processing toolkit, mainly aimed at English
# data, with a long history.
# The tokenizer models and the stopwords corpus must be available locally;
# uncomment the next line to open the NLTK downloader if they are missing.
# nltk.download()

# NOTE(review): "Te" looks like a typo for "we"; left untouched because the
# string is runtime data and changing it would change the resulting tokens.
input_str = (
    "Today's weather is good, very windy and sunny, "
    "we have no classes in the afternoon,Te have to play "
    "basketball tomorrow."
)

# Tokenize, then lower-case every token so later lookups are case-insensitive.
tokens = word_tokenize(input_str)
tokens = [word.lower() for word in tokens]
print(tokens)

# Wrap the tokens in a Text object for convenient exploration.
t = Text(tokens)
# In a script a bare expression displays nothing, so print the results.
print(t.count('good'))  # number of occurrences of 'good'
print(t.index('good'))  # position of the first 'good'

# Plot the 8 most frequent tokens.
t.plot(8)

# Stopwords: show the corpus README on a single line.
print(stopwords.readme().replace('\n', ' '))

# List the file ids (languages) shipped with the stopwords corpus.
print(stopwords.fileids())

# Show the raw English stopword list on a single line.
print(stopwords.raw('english').replace('\n', ' '))

# Collapse the tokens into a set of unique words.
test_words = set(tokens)