"""
@author : Sakshi Tantak

Predict tweet sentiment with the classical ML pipeline: clean and normalise the
text, vectorise it with a count vectorizer and TF-IDF transform, and classify it
with the pickled Naive Bayes model.
"""
# Imports
import re
import string
import pickle
from time import time
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import emoji
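
# Filesystem locations of the pickled classifier and vectorizer artifacts (defined in the local paths module)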
from paths import COUNT_VECTORIZER_PATH, TFIDF_VECTORIZER_PATH, NB_MODEL_PATH as MODEL_PATH

# Download the NLTK resources needed for tokenisation, lemmatisation and stopword removal
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

# Keep negation words out of the stopword list so sentiment-bearing negations survive
stops = stopwords.words('english')
negatives = ['no', 'nor', 'not', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
             "hasn't", 'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
             'wasn', "wasn't", 'weren', "weren't", "won't", 'wouldn', "wouldn't", 'don', "don't"]
stops = set([stop for stop in stops if stop not in negatives])

lemmatizer = WordNetLemmatizer()

# Lazily-loaded singletons, populated on the first call to load_model()
MODEL, COUNT_VECTORIZER, TFIDF = None, None, None

def clean_text(text):
    """Normalise raw tweet text before vectorisation."""
    # Collapse repeated punctuation into a single character
    text = re.sub(r'[\.]+', '.', text)
    text = re.sub(r'[\!]+', '!', text)
    text = re.sub(r'[\?]+', '?', text)
    # Collapse whitespace and lowercase
    text = re.sub(r'\s+', ' ', text).strip().lower()
    # Strip @mentions
    text = re.sub(r'@\w+', '', text).strip().lower()
    # Map elongated forms like "nooo" to "no"
    text = re.sub(r'\s[n]+[o]+', ' no', text)
    # Expand common contractions
    text = re.sub(r'n\'t', 'n not', text)
    text = re.sub(r'\'nt', 'n not', text)
    text = re.sub(r'\'re', ' are', text)
    text = re.sub(r'\'s', ' is', text)
    text = re.sub(r'\'d', ' would', text)
    text = re.sub(r'\'ll', ' will', text)
    text = re.sub(r'\'ve', ' have', text)
    text = re.sub(r'\'m', ' am', text)
    # Map variations of "nope" to "no"
    text = re.sub(r'\s[n]+[o]+[p]+[e]+', ' no', text)
    # Remove URLs and websites mentioned in the text
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\~)*\b', '', text, flags=re.MULTILINE).strip()
    text = re.sub(r'(www\.)(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE).strip()
    text = re.sub(r'\w+\.com', '', text).strip()
    # Replace emojis with their textual names
    text = emoji.demojize(text)
    return text

def remove_punctuation(text):
    """Replace every punctuation character with a space, then collapse whitespace."""
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(translator)
    return re.sub(r'\s+', ' ', text).strip()

def remove_numbers(text):
    """Strip all digits from the text."""
    return re.sub(r'[0-9]+', '', text)

def remove_stopwords_and_lemmatize(text):
    """Tokenise, drop (non-negation) stopwords and lemmatise the remaining tokens."""
    tokens = word_tokenize(text)
    tokens = [token.strip() for token in tokens if token.strip() not in stops]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

def load_model():
    """Load the pickled classifier and vectorizers once and cache them in module globals."""
    global MODEL, COUNT_VECTORIZER, TFIDF
    if MODEL is None:
        with open(MODEL_PATH, 'rb') as f:
            print('Loading classifier ...')
            start = time()
            MODEL = pickle.load(f)
            print(f'Time taken to load model = {time() - start}')
    if COUNT_VECTORIZER is None:
        with open(COUNT_VECTORIZER_PATH, 'rb') as f:
            print('Loading count vectorizer ...')
            start = time()
            COUNT_VECTORIZER = pickle.load(f)
            print(f'Time taken to load count vectorizer = {time() - start}')
    if TFIDF is None:
        with open(TFIDF_VECTORIZER_PATH, 'rb') as f:
            print('Loading tfidf vectorizer ...')
            start = time()
            TFIDF = pickle.load(f)
            print(f'Time taken to load tfidf vectorizer = {time() - start}')
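
# The pickled artifacts are assumed to come from a scikit-learn pipeline along
# these lines (a minimal sketch, not the actual training script; the real
# feature settings, classifier and label encoding may differ, and train_texts /
# train_labels are placeholder names):
#
#   from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
#   from sklearn.naive_bayes import MultinomialNB
#
#   count_vectorizer = CountVectorizer()
#   counts = count_vectorizer.fit_transform(train_texts)   # bag-of-words counts
#   tfidf = TfidfTransformer()
#   vectors = tfidf.fit_transform(counts)                   # re-weight counts by TF-IDF
#   model = MultinomialNB().fit(vectors, train_labels)      # 1 = positive, per predict() below
#
#   with open(COUNT_VECTORIZER_PATH, 'wb') as f:
#       pickle.dump(count_vectorizer, f)
#   with open(TFIDF_VECTORIZER_PATH, 'wb') as f:
#       pickle.dump(tfidf, f)
#   with open(MODEL_PATH, 'wb') as f:
#       pickle.dump(model, f)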

def predict(text):
    """Preprocess a tweet, vectorise it and return a (label, class) pair from the classifier."""
    if MODEL is None:
        load_model()
    # Apply the same preprocessing used at training time
    text = clean_text(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = remove_stopwords_and_lemmatize(text)
    # Bag-of-words counts, then TF-IDF weighting
    vector = COUNT_VECTORIZER.transform([text]).toarray()
    vector = TFIDF.transform(vector).toarray()
    start = time()
    prediction = MODEL.predict(vector).item()
    print(f'Inference time = {time() - start}')
    return ('positive', 1) if prediction == 1 else ('negative', 0)
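
# Quick manual test from the command line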
if __name__ == '__main__':
text = input('Enter tweet : ')
# text = "i am so bored!!!"
prediction = predict(text)
print(text, ' : ', prediction)