"""Text-preprocessing helpers: tokenizing, stemming and bag-of-words encoding."""

import nltk
import numpy as np
from nltk.stem.porter import PorterStemmer

# Uncomment on first run to fetch the NLTK data packages; the tokenizer
# used by ``tokenize`` needs its model files to be present locally.
# nltk.download('all')

# One shared stemmer instance, reused by every call to ``stem``.
stemmer = PorterStemmer()


def tokenize(sentence):
    """Split *sentence* into a list of tokens.

    A token can be a word, a punctuation character, or a number.

    Args:
        sentence: The raw text to split.

    Returns:
        The list of token strings produced by ``nltk.word_tokenize``.
    """
    return nltk.word_tokenize(sentence)


def stem(word):
    """Return the Porter stem of *word*, lower-casing it first.

    Args:
        word: A single token.

    Returns:
        The stemmed, lower-cased form of the token.
    """
    return stemmer.stem(word.lower())


def bag_of_words(tokenized_sentence, all_words):
    """Encode a tokenized sentence as a binary bag-of-words vector.

    Every token is stemmed with ``stem`` before the comparison; the entries
    of *all_words* are compared as-is.
    NOTE(review): this presumably expects *all_words* to hold already-stemmed
    vocabulary entries -- confirm against the caller.

    Args:
        tokenized_sentence: Iterable of token strings (e.g. from ``tokenize``).
        all_words: Sequence of vocabulary words; position ``i`` of the result
            corresponds to ``all_words[i]``.

    Returns:
        A ``float32`` numpy array of length ``len(all_words)`` holding ``1.0``
        where the vocabulary word occurs among the stemmed tokens and ``0.0``
        everywhere else.
    """
    # Build the set once so each vocabulary lookup is O(1) instead of a
    # linear scan over the sentence for every vocabulary word.
    stemmed_tokens = {stem(token) for token in tokenized_sentence}

    bag = np.zeros(len(all_words), dtype=np.float32)
    for idx, word in enumerate(all_words):
        if word in stemmed_tokens:
            bag[idx] = 1.0
    return bag