"""
Tweet sentiment preprocessing and Naive-Bayes inference utilities.

@author : Sakshi Tantak
"""
# Imports (stdlib / third-party / local)
import pickle
import re
import string
from time import time

import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from paths import COUNT_VECTORIZER_PATH, TFIDF_VECTORIZER_PATH, NB_MODEL_PATH as MODEL_PATH

# One-time NLTK resource downloads (tokenizer, lemmatizer data, stopwords).
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

# Negation words are deliberately kept OUT of the stopword set so that
# sentiment cues such as "not" / "don't" survive preprocessing.
negatives = ['no','nor','not','ain','aren',"aren't",'couldn',"couldn't",'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',
"hasn't",'haven',"haven't",'isn',"isn't",'mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",'shouldn',"shouldn't",
'wasn',"wasn't",'weren',"weren't","won't",'wouldn',"wouldn't",'don',"don't"]
stops = set(stopwords.words('english')) - set(negatives)

lemmatizer = WordNetLemmatizer()

# Lazily populated by load_model(); None until first prediction.
MODEL, COUNT_VECTORIZER, TFIDF = None, None, None
def clean_text(text):
    """Normalise raw tweet text before vectorisation.

    Collapses repeated punctuation, lower-cases, strips @mentions and
    URLs, expands common English contractions, maps elongated "nooo"/
    "nope" variants to "no", and replaces emojis with ``:name:`` tokens.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned, lower-cased text.
    """
    # Collapse runs of sentence punctuation into a single mark.
    text = re.sub(r'[\.]+', '.', text)
    text = re.sub(r'[\!]+', '!', text)
    # BUG FIX: the original replaced '?' runs with '!'; keep them as '?'.
    text = re.sub(r'[\?]+', '?', text)
    # Normalise whitespace and case before the word-level rules below.
    text = re.sub(r'\s+', ' ', text).strip().lower()
    # Drop @mentions.
    text = re.sub(r'@\w+', '', text).strip().lower()
    # Map elongated "noo..." to "no".
    text = re.sub(r'\s[n]+[o]+', ' no', text)
    # Expand contractions; "n't" -> "n not" intentionally preserves the
    # explicit "not" token for the sentiment model.
    text = re.sub(r'n\'t', 'n not', text)
    text = re.sub(r'\'nt', 'n not', text)
    text = re.sub(r'\'re', ' are', text)
    text = re.sub(r'\'s', ' is', text)
    text = re.sub(r'\'d', ' would', text)
    text = re.sub(r'\'ll', ' will', text)
    text = re.sub(r'\'ve', ' have', text)
    text = re.sub(r'\'m', ' am', text)
    # Map variations of nope to no
    text = re.sub(r'\s[n]+[o]+[p]+[e]+', ' no', text)
    # Clean websites mentioned in text (scheme URLs, www-prefixed, *.com).
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\~)*\b', '', text, flags=re.MULTILINE).strip()
    text = re.sub(r'(www.)(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE).strip()
    # BUG FIX: escape the dot so only a literal ".com" matches; the
    # unescaped '.' also matched words like "welcome".
    text = re.sub(r'\w+\.com', '', text).strip()
    # Replace emojis with their textual :name: aliases.
    text = emoji.demojize(text)
    return text
def remove_punctuation(text):
    """Replace every ASCII punctuation character with a space, then
    collapse all whitespace runs into single spaces and trim the ends."""
    table = {ord(ch): ' ' for ch in string.punctuation}
    spaced = text.translate(table)
    # split() with no args splits on any whitespace and drops empties,
    # so join gives the single-spaced, stripped result.
    return ' '.join(spaced.split())
def remove_numbers(text):
    """Delete every ASCII digit (0-9) from *text*."""
    return ''.join(ch for ch in text if ch not in '0123456789')
def remove_stopwords_and_lemmatize(text):
    """Tokenize *text*, drop (non-negation) stopwords and lemmatize the
    surviving tokens, returning a space-joined string.

    Relies on the module-level ``stops`` set, ``lemmatizer`` and NLTK's
    ``word_tokenize``.
    """
    kept = []
    for raw in word_tokenize(text):
        token = raw.strip()
        if token in stops:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)
def load_model():
    """Lazily unpickle the classifier and both vectorizers into the
    module-level globals MODEL, COUNT_VECTORIZER and TFIDF.

    Idempotent: each artifact is loaded only if its global is still None.
    NOTE(review): pickle.load is only safe because these are trusted
    local model files; never point the paths at untrusted data.
    """
    global MODEL, COUNT_VECTORIZER, TFIDF
    # FIX: removed the redundant f.close() calls — the `with` blocks
    # already close each file on exit.
    if MODEL is None:
        with open(MODEL_PATH, 'rb') as f:
            print('Loading classifier ...')
            start = time()
            MODEL = pickle.load(f)
            print(f'Time taken to load model = {time() - start}')
    if COUNT_VECTORIZER is None:
        with open(COUNT_VECTORIZER_PATH, 'rb') as f:
            print('Loading count vectorizer ...')
            start = time()
            COUNT_VECTORIZER = pickle.load(f)
            print(f'Time taken to load count vectorizer = {time() - start}')
    if TFIDF is None:
        with open(TFIDF_VECTORIZER_PATH, 'rb') as f:
            print('Loading tfidf vectorizer ...')
            start = time()
            TFIDF = pickle.load(f)
            print(f'Time taken to load tfidf vectorizer = {time() - start}')
def predict(text):
    """Run the full preprocessing pipeline on *text* and classify it.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    tuple[str, int]
        ('positive', 1) when the model predicts class 1, otherwise
        ('negative', 1).
        NOTE(review): the second element is 1 in BOTH branches — the
        negative branch likely intended 0; confirm with callers before
        changing.
    """
    if MODEL is None:
        load_model()
    text = clean_text(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = remove_stopwords_and_lemmatize(text)
    vector = COUNT_VECTORIZER.transform([text]).toarray()
    vector = TFIDF.transform(vector).toarray()
    start = time()
    # BUG FIX: run inference once — the original called MODEL.predict()
    # twice, doubling inference time.
    prediction = MODEL.predict(vector).item()
    print(prediction)
    print(f'Inference time = {time() - start}')
    return ('positive', 1) if prediction == 1 else ('negative', 1)
if __name__ == '__main__':
    # Simple interactive smoke test: classify one tweet from stdin.
    tweet = input('Enter tweet : ')
    result = predict(tweet)
    print(tweet, ' : ', result)