"""
    @author : Sakshi Tantak
"""

# Imports
import re
import string
import pickle
from time import time

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import emoji

from paths import COUNT_VECTORIZER_PATH, TFIDF_VECTORIZER_PATH, NB_MODEL_PATH as MODEL_PATH

# download the NLTK resources needed for tokenization, stopword removal and lemmatization
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

stops = stopwords.words('english')
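# negation words are kept out of the stopword set because they carry sentiment signal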
negatives = ['no','nor','not','ain','aren',"aren't",'couldn',"couldn't",'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',
  "hasn't",'haven',"haven't",'isn',"isn't",'mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",'shouldn',"shouldn't",
  'wasn',"wasn't",'weren',"weren't","won't",'wouldn',"wouldn't",'don',"don't"]
stops = set([stop for stop in stops if stop not in negatives])

lemmatizer = WordNetLemmatizer()
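# classifier and vectorizers are loaded lazily by load_model() on first use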
MODEL, COUNT_VECTORIZER, TFIDF = None, None, None

def clean_text(text):
    """Normalize a raw tweet: collapse repeated punctuation, expand contractions,
    strip mentions and URLs, and convert emojis to their text aliases."""
    # collapse runs of repeated punctuation into a single character
    text = re.sub(r'[\.]+', '.', text)
    text = re.sub(r'[\!]+', '!', text)
    text = re.sub(r'[\?]+', '?', text)
    # normalize whitespace and case, and drop @mentions
    text = re.sub(r'\s+', ' ', text).strip().lower()
    text = re.sub(r'@\w+', '', text).strip().lower()
    # map elongated "noo..." to "no"
    text = re.sub(r'\s[n]+[o]+', ' no', text)
    # crude contraction expansion (keeps an explicit "not" token for negations)
    text = re.sub(r'n\'t', 'n not', text)
    text = re.sub(r'\'nt', 'n not', text)
    text = re.sub(r'\'re', ' are', text)
    text = re.sub(r'\'s', ' is', text)
    text = re.sub(r'\'d', ' would', text)
    text = re.sub(r'\'ll', ' will', text)
    text = re.sub(r'\'ve', ' have', text)
    text = re.sub(r'\'m', ' am', text)
    # map variations of nope to no
    text = re.sub(r'\s[n]+[o]+[p]+[e]+', ' no', text)
    # clean websites mentioned in text
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\~)*\b', '', text, flags=re.MULTILINE).strip()
    text = re.sub(r'(www\.)(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE).strip()
    text = re.sub(r'\w+\.com', '', text).strip()
    # replace emojis with their text aliases so they survive tokenization
    text = emoji.demojize(text)
    return text

def remove_punctuation(text):
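    """Replace each punctuation character with a space, then collapse repeated whitespace."""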
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    text = text.translate(translator)
    return re.sub(r'\s+', ' ', text).strip()

def remove_numbers(text):
    return re.sub(r'[0-9]+', '', text)

def remove_stopwords_and_lemmatize(text):
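    """Tokenize, drop stopwords (negations are retained) and lemmatize the remaining tokens."""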
    tokens = word_tokenize(text)
    tokens = [token.strip() for token in tokens if token.strip() not in stops]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

def load_model():
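    """Load the pickled classifier, count vectorizer and tf-idf transformer into module globals (no-op if already loaded)."""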
    global MODEL, COUNT_VECTORIZER, TFIDF

    if MODEL is None:
        with open(MODEL_PATH, 'rb') as f:
            print('Loading classifier ...')
            start = time()
            MODEL = pickle.load(f)
            print(f'Time taken to load model = {time() - start}')

    if COUNT_VECTORIZER is None:
        with open(COUNT_VECTORIZER_PATH, 'rb') as f:
            print('Loading count vectorizer ...')
            start = time()
            COUNT_VECTORIZER = pickle.load(f)
            print(f'Time taken to load count vectorizer = {time() - start}')

    if TFIDF is None:
        with open(TFIDF_VECTORIZER_PATH, 'rb') as f:
            print('Loading tfidf vectorizer ...')
            start = time()
            TFIDF = pickle.load(f)
            print(f'Time taken to load tfidf vectorizer = {time() - start}')

def predict(text):
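    """Run the full preprocessing pipeline on a tweet and return (sentiment_label, predicted_class)."""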
    if MODEL is None:
        load_model()

    text = clean_text(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = remove_stopwords_and_lemmatize(text)

    vector = COUNT_VECTORIZER.transform([text]).toarray()
    vector = TFIDF.transform(vector).toarray()
    start = time()
    # run the classifier once and unwrap the single prediction
    prediction = MODEL.predict(vector).item()
    print(f'Inference time = {time() - start}')
    # return the sentiment label along with the raw predicted class
    return ('positive', prediction) if prediction == 1 else ('negative', prediction)

if __name__ == '__main__':
    text = input('Enter tweet : ')
    # text = "i am so bored!!!"
    prediction = predict(text)
    print(text, ' : ', prediction)