"""
@author : Sakshi Tantak
"""
# Imports
import re
import string
import pickle
from time import time
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import emoji
from paths import COUNT_VECTORIZER_PATH, TFIDF_VECTORIZER_PATH, NB_MODEL_PATH as MODEL_PATH
# One-time NLTK resource downloads (tokenizer models, lemmatizer data, stopwords).
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

# Negation words carry sentiment signal ("not good" vs "good"), so they are
# deliberately kept out of the stopword set.
negatives = [
    'no', 'nor', 'not', 'ain', 'aren', "aren't", 'couldn', "couldn't",
    'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't",
    'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn',
    "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
    'wasn', "wasn't", 'weren', "weren't", "won't", 'wouldn', "wouldn't",
    'don', "don't",
]

# English stopwords minus the negation words above.
stops = set(stopwords.words('english')) - set(negatives)
lemmatizer = WordNetLemmatizer()

# Lazily-populated globals; see load_model().
MODEL, COUNT_VECTORIZER, TFIDF = None, None, None
def clean_text(text):
    """Normalize a raw tweet for downstream vectorization.

    Collapses repeated punctuation, lowercases, strips @mentions and URLs,
    expands common contractions (preserving negations), and converts emoji
    to ``:name:`` text tokens.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned text.
    """
    # Collapse runs of repeated sentence punctuation to a single character.
    text = re.sub(r'\.+', '.', text)
    text = re.sub(r'!+', '!', text)
    # BUG FIX: runs of '?' were previously replaced with '!' (copy-paste from
    # the line above); keep them as a single '?'.
    text = re.sub(r'\?+', '?', text)
    # Normalize whitespace and case.
    text = re.sub(r'\s+', ' ', text).strip().lower()
    # Drop @mentions.
    text = re.sub(r'@\w+', '', text).strip().lower()
    # Map elongated "nooo" variants to "no".
    text = re.sub(r'\s[n]+[o]+', ' no', text)
    # Expand contractions so negation words survive stopword filtering.
    text = re.sub(r'n\'t', 'n not', text)
    text = re.sub(r'\'nt', 'n not', text)
    text = re.sub(r'\'re', ' are', text)
    text = re.sub(r'\'s', ' is', text)
    text = re.sub(r'\'d', ' would', text)
    text = re.sub(r'\'ll', ' will', text)
    text = re.sub(r'\'ve', ' have', text)
    text = re.sub(r'\'m', ' am', text)
    # Map variations of "nope" to "no".
    text = re.sub(r'\s[n]+[o]+[p]+[e]+', ' no', text)
    # Remove URLs: http(s)://..., www...., and bare *.com domains.
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\~)*\b', '', text, flags=re.MULTILINE).strip()
    # BUG FIX: the dot in 'www.' was unescaped and matched any character.
    text = re.sub(r'(www\.)(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE).strip()
    # BUG FIX: unescaped dot made r'\w+.com' match e.g. 'welcome' fragments
    # like 'xcom'; escape it so only literal '.com' domains are removed.
    text = re.sub(r'\w+\.com', '', text).strip()
    # Render emoji as ':emoji_name:' tokens so they survive as features.
    text = emoji.demojize(text)
    return text
def remove_punctuation(text):
    """Replace every ASCII punctuation character with a space, then collapse
    runs of whitespace to single spaces and trim the ends."""
    spaces_for_punct = str.maketrans({ch: ' ' for ch in string.punctuation})
    despaced = text.translate(spaces_for_punct)
    return ' '.join(despaced.split())
def remove_numbers(text):
    """Strip all ASCII digits (0-9) from *text*."""
    drop_digits = str.maketrans('', '', '0123456789')
    return text.translate(drop_digits)
def remove_stopwords_and_lemmatize(text):
    """Tokenize *text*, drop stopwords (negations are retained by the
    module-level `stops` set), lemmatize the survivors, and re-join
    them into a single space-separated string."""
    stripped = (raw.strip() for raw in word_tokenize(text))
    lemmas = [lemmatizer.lemmatize(tok) for tok in stripped if tok not in stops]
    return ' '.join(lemmas)
def _load_pickle(path, label):
    """Unpickle and return the object stored at *path*, logging load time.

    NOTE(review): pickle.load executes arbitrary code from the file —
    only load model artifacts from trusted sources.
    """
    # 'with' closes the file automatically; the original code also called
    # f.close() redundantly inside the with-block.
    with open(path, 'rb') as f:
        print(f'Loading {label} ...')
        start = time()
        obj = pickle.load(f)
        print(f'Time taken to load {label} = {time() - start}')
    return obj


def load_model():
    """Populate the module-level MODEL / COUNT_VECTORIZER / TFIDF globals
    from disk. Lazy and idempotent: each artifact is loaded at most once."""
    global MODEL, COUNT_VECTORIZER, TFIDF
    if MODEL is None:
        MODEL = _load_pickle(MODEL_PATH, 'classifier')
    if COUNT_VECTORIZER is None:
        COUNT_VECTORIZER = _load_pickle(COUNT_VECTORIZER_PATH, 'count vectorizer')
    if TFIDF is None:
        TFIDF = _load_pickle(TFIDF_VECTORIZER_PATH, 'tfidf vectorizer')
def predict(text):
    """Classify one raw tweet through the full preprocessing + TF-IDF +
    classifier pipeline.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    tuple
        ('positive', 1) or ('negative', 1) depending on the model output.
    """
    # Lazily load the model and vectorizers on first call.
    if MODEL is None:
        load_model()
    # Preprocess exactly as at training time.
    text = clean_text(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = remove_stopwords_and_lemmatize(text)
    # Vectorize: raw counts first, then tf-idf weighting.
    vector = COUNT_VECTORIZER.transform([text]).toarray()
    vector = TFIDF.transform(vector).toarray()
    start = time()
    # BUG FIX: the model was previously invoked twice (predict() called two
    # times in a row); run inference once and reuse the result.
    prediction = MODEL.predict(vector).item()
    print(prediction)
    print(f'Inference time = {time() - start}')
    # NOTE(review): the second tuple element is 1 for both classes — it looks
    # like ('negative', 0) may have been intended; confirm with callers
    # before changing the returned value.
    return ('positive', 1) if prediction == 1 else ('negative', 1)
if __name__ == '__main__':
    # Manual smoke test: classify a single tweet typed on stdin.
    # (Removed a stray trailing '|' artifact that broke the final line,
    # plus dead commented-out sample input.)
    text = input('Enter tweet : ')
    prediction = predict(text)
    print(text, ' : ', prediction)