import nltk

# nltk.download()
# NLTK is a very practical, long-established text-processing toolkit, mainly used for English data.
from nltk.tokenize import word_tokenize
from nltk.text import Text
from nltk.corpus import stopwords

# Sample sentence used as input for the tokenization demo.
input_str = (
    "Today's weather is good, very windy and sunny, we have no classes in the afternoon,Te have to play "
    "basketball tomorrow."
)

# Tokenize the sentence, then lower-case every token.
tokens = word_tokenize(input_str)
tokens = [token.lower() for token in tokens]
print(tokens)

# Wrap the tokens in a Text object for convenient querying.
t = Text(tokens)
# Bare expressions only echo their value in a REPL/notebook; print them so
# the results are visible when this file runs as a script.
print(t.count('good'))  # number of occurrences of 'good'
print(t.index('good'))  # position of the first 'good' in the token list
# Plot the 8 most frequent tokens.
t.plot(8)

# Stop words: show the corpus README with newlines flattened to spaces.
# The results are printed because a bare expression displays nothing when
# this file runs as a script.
print(stopwords.readme().replace('\n', ' '))
# List the languages (file ids) included in the stop-word corpus.
print(stopwords.fileids())
# Inspect the English stop words, with newlines flattened to spaces.
print(stopwords.raw('english').replace('\n', ' '))
# Convert the tokens to a set.
test_words = set(tokens)