prasanth345 committed
Commit 2f7cb3f · verified · 1 Parent(s): 2ac8202

Upload 4 files

Files changed (4):
  1. app.py +69 -0
  2. model.pkl +3 -0
  3. vectorizer.pkl +3 -0
  4. ytcomments.py +349 -0
app.py ADDED
@@ -0,0 +1,69 @@
+ from flask import Flask, render_template, request
+ import pickle
+ import string
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.stem import PorterStemmer
+
+ nltk.download('stopwords')
+
+ app = Flask(__name__)
+
+ # Load the trained model and vectorizer
+ model = pickle.load(open('model.pkl', 'rb'))
+ vectorizer = pickle.load(open('vectorizer.pkl', 'rb'))
+
+ @app.route('/')
+ def home():
+     return render_template('index.html')
+
+ @app.route('/new-url', methods=['POST'])
+ def new_url_predict():  # same pipeline as /predict, exposed at a second URL
+     comment = request.form['comment']
+     processed_comment = preprocess_comment(comment)
+     features = vectorizer.transform([processed_comment])
+     prediction = model.predict(features)[0]
+     sentiment = get_sentiment_label(prediction)
+     return render_template('result.html', comment=comment, sentiment=sentiment)
+
+ @app.route('/predict', methods=['POST'])
+ def predict():
+     comment = request.form['comment']
+     processed_comment = preprocess_comment(comment)
+     features = vectorizer.transform([processed_comment])
+     prediction = model.predict(features)[0]
+     sentiment = get_sentiment_label(prediction)
+     return render_template('result.html', comment=comment, sentiment=sentiment)
+
+ def preprocess_comment(comment):
+     # Lowercase, strip punctuation, drop stopwords, then stem
+     comment = comment.lower()
+     comment = comment.translate(str.maketrans('', '', string.punctuation))
+     comment = remove_stopwords(comment)
+     comment = stem_words(comment)
+     return comment
+
+ def remove_stopwords(comment):
+     stopwords_english = set(stopwords.words('english'))
+     comment_tokens = comment.split()
+     comment = ' '.join([word for word in comment_tokens if word not in stopwords_english])
+     return comment
+
+ def stem_words(comment):
+     stemmer = PorterStemmer()
+     comment_tokens = comment.split()
+     comment = ' '.join([stemmer.stem(word) for word in comment_tokens])
+     return comment
+
+ def get_sentiment_label(prediction):
+     if prediction == 0:
+         return 'negative'
+     elif prediction == 1:
+         return 'neutral'
+     elif prediction == 2:
+         return 'positive'
+     else:
+         return 'unknown'
+
+
+ if __name__ == '__main__':
+     app.run(debug=True, port=5001)
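
A quick way to exercise the endpoint once the app is up (a minimal sketch, assuming the server was started locally with "python app.py" on port 5001 as configured above, and that templates/index.html and templates/result.html exist; the sample comment is hypothetical):

import requests

# POST a form-encoded comment, matching request.form['comment'] in app.py
resp = requests.post('http://127.0.0.1:5001/predict',
                     data={'comment': 'this video was really helpful'})
print(resp.status_code)  # 200 if result.html rendered
print(resp.text[:200])   # rendered HTML containing the predicted sentiment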
model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b3e12d9cad8afa3c46acae3adba5beeb1f3609c10e24e749c9b47fe93d44364a
+ size 72758
vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1ff764be18861f5abfab97327eff36f8b377d6813eff0f133e0894f4c2fea802
+ size 404491
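
Both .pkl files are stored with Git LFS, so the diff shows only the pointer files (spec version, sha256 oid, byte size) rather than the binary pickles. After cloning, the actual artifacts can be fetched with the standard LFS command (assuming git-lfs is installed and initialized):

git lfs pull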
ytcomments.py ADDED
@@ -0,0 +1,349 @@
+ # -*- coding: utf-8 -*-
+ """ytcomments.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1IAkt_1sG94cjURWKBvghkoK2KlZiYzX9
+ """
+
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from sklearn.model_selection import train_test_split
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import accuracy_score
+ from sklearn.preprocessing import LabelEncoder
+
+ df = pd.read_csv("/content/comments.csv")
+
+ df.head()
+
+ df = df.iloc[:, [2, 4]]  # keep only the Comment and Sentiment columns
+
+ df.head(2)
+
+ df.info()
+
+ df.isnull().sum()
+
+ df.dropna(axis=0, how='any', inplace=True)
+
+ df.columns
+
+ df['Sentiment'] = df['Sentiment'].astype('int')
+
+ #######################################
+ # basic preprocessing
+ #######################################
+
+ df['Comment'] = df['Comment'].str.lower()
+
+ import string
+ string.punctuation
+
+ exclude = string.punctuation
+ def remove_punc(text):
+     for char in exclude:
+         text = text.replace(char, '')
+     return text
+
+ df['Comment'] = df['Comment'].apply(remove_punc)
+
+ import nltk
+ nltk.download('stopwords')
+
+ from nltk.corpus import stopwords
+ stop_words = set(stopwords.words('english'))  # build the set once; per-word lookup is O(1)
+
+ def remove_stopwords(text):
+     # fixed: skip stopwords instead of appending '' (which left double spaces),
+     # and avoid rebuilding the stopword list on every word
+     new_text = []
+     for word in text.split():
+         if word not in stop_words:
+             new_text.append(word)
+     return " ".join(new_text)
+
+ df['Comment'] = df['Comment'].apply(remove_stopwords)
+
+ from nltk.stem.porter import PorterStemmer
+ ps = PorterStemmer()
+
+ def stem_words(text):
+     return " ".join([ps.stem(word) for word in text.split()])
+
+ df['Comment'] = df['Comment'].apply(stem_words)
+
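+ # Sanity check of the cleaning pipeline on a hypothetical sample:
+ #   remove_stopwords(remove_punc("This video is GREAT!!!".lower()))
+ #   -> 'video great'  (stem_words then leaves both tokens unchanged)
+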
+ ####################################################
+ # EDA
+ ####################################################
+
+ # sort_index() aligns the slice order with the 0/1/2 labels
+ plt.pie(df['Sentiment'].value_counts().sort_index(), labels=['negative', 'neutral', 'positive'], autopct="%0.2f")
+ plt.show()
+
+ import nltk
+
+ nltk.download('punkt')
+
+ df['total_characters'] = df['Comment'].apply(len)
+
+ df.head(2)
+
+ df['total_words'] = df['Comment'].apply(lambda x: len(nltk.word_tokenize(x)))
+
+ df.head(2)
+
+ df['total_sentences'] = df['Comment'].apply(lambda x: len(nltk.sent_tokenize(x)))
+
+ df.head(2)
+
+ df[['total_characters', 'total_sentences', 'total_words']].describe()
+
+ mask0 = df['Sentiment'] == 0
+ mask1 = df['Sentiment'] == 1
+ mask2 = df['Sentiment'] == 2
+
+ df[mask0][['total_sentences', 'total_words', 'total_characters']].describe()
+
+ df[mask1][['total_sentences', 'total_words', 'total_characters']].describe()
+
+ df[mask2][['total_sentences', 'total_words', 'total_characters']].describe()
+
+ plt.figure(figsize=(12, 6))
+ sns.histplot(df[df['Sentiment'] == 0]['total_characters'], color='green')
+ sns.histplot(df[df['Sentiment'] == 1]['total_characters'], color='red')
+ sns.histplot(df[df['Sentiment'] == 2]['total_characters'], color='pink')
+
+ plt.figure(figsize=(10, 4))
+ sns.histplot(df[df['Sentiment'] == 0]['total_words'], color='green')
+ sns.histplot(df[df['Sentiment'] == 1]['total_words'], color='red')
+ sns.histplot(df[df['Sentiment'] == 2]['total_words'], color='pink')
+
+ sns.pairplot(df, hue='Sentiment')
+
+ sns.heatmap(df.corr(numeric_only=True), annot=True)  # numeric_only skips the text column
+
+ from wordcloud import WordCloud
+ # generate() mutates and returns the same WordCloud instance, so each class
+ # needs its own object; otherwise all three variables alias the last cloud.
+ wc_params = dict(width=500, height=500, min_font_size=10, background_color='white')
+
+ negative_wc = WordCloud(**wc_params).generate(df[df['Sentiment'] == 0]['Comment'].str.cat(sep=" "))
+
+ neutral_wc = WordCloud(**wc_params).generate(df[df['Sentiment'] == 1]['Comment'].str.cat(sep=" "))
+
+ positive_wc = WordCloud(**wc_params).generate(df[df['Sentiment'] == 2]['Comment'].str.cat(sep=" "))
+
+ plt.figure(figsize=(6, 6))
+ plt.imshow(negative_wc)
+
+ plt.figure(figsize=(6, 6))
+ plt.imshow(neutral_wc)
+
+ plt.figure(figsize=(6, 6))
+ plt.imshow(positive_wc)
+
+ negative_corpus = []
+ for msg in df[df['Sentiment'] == 0]['Comment'].tolist():
+     for word in msg.split():
+         negative_corpus.append(word)
+
+ neutral_corpus = []
+ for msg in df[df['Sentiment'] == 1]['Comment'].tolist():
+     for word in msg.split():
+         neutral_corpus.append(word)
+
+ positive_corpus = []
+ for msg in df[df['Sentiment'] == 2]['Comment'].tolist():  # fixed: was == 1, which duplicated the neutral corpus
+     for word in msg.split():
+         positive_corpus.append(word)
+
+ print(len(negative_corpus))
+ print(len(neutral_corpus))
+ print(len(positive_corpus))
+
+ from collections import Counter
+
+ pd.DataFrame(Counter(negative_corpus).most_common(30))
+
+ ##############################################
+ # bag of words
+ ##############################################
+
+ cv = CountVectorizer(lowercase=True, stop_words='english', max_features=3000)
+ tfidf = TfidfVectorizer(max_features=3000)
+
+ features_cv = cv.fit_transform(df['Comment']).toarray()
+ features_tfidf = tfidf.fit_transform(df['Comment']).toarray()
+
+ type(features_cv)
+
+ features_cv
+
+ dict1 = pd.DataFrame(features_cv)
+ dict2 = pd.DataFrame(features_tfidf)
+
+ dict1.shape
+
+ df.columns
+
+ x_cv = dict1.iloc[:, :]
+ y_cv = df[['Sentiment']]
+
+ y_cv.columns
+
+ np.unique(y_cv)
+
+ x_tfidf = dict2.iloc[:, :]  # fixed: was dict1, which reused the count features
+ y_tfidf = df['Sentiment']   # fixed: df.iloc[:, -1] now points at total_sentences, not Sentiment
+
+ print(x_cv.shape)
+ print(x_tfidf.shape)
+
+ print(y_cv.shape)
+ print(y_tfidf.shape)
+
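+ # Hypothetical mini-corpus contrasting the two feature spaces:
+ toy = ["good video", "bad video"]
+ print(CountVectorizer().fit_transform(toy).toarray())  # raw term counts
+ print(TfidfVectorizer().fit_transform(toy).toarray())  # counts reweighted by inverse document frequency
+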
+ x_cv_train, x_cv_test, y_cv_train, y_cv_test = train_test_split(x_cv, y_cv, test_size=0.2)
+
+ y_cv_test.shape
+
+ x_tfidf_train, x_tfidf_test, y_tfidf_train, y_tfidf_test = train_test_split(x_tfidf, y_tfidf, test_size=0.2)
+
+ print(x_cv_train.shape)
+ print(y_cv_test.shape)
+ print(x_tfidf_train.shape)
+ print(y_tfidf_test.shape)
+
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.svm import SVC
+ from sklearn.naive_bayes import MultinomialNB
+ from sklearn.tree import DecisionTreeClassifier
+ from sklearn.neighbors import KNeighborsClassifier
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.ensemble import AdaBoostClassifier
+ from sklearn.ensemble import BaggingClassifier
+ from sklearn.ensemble import ExtraTreesClassifier
+ from sklearn.ensemble import GradientBoostingClassifier
+ from xgboost import XGBClassifier
+ from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
+ from sklearn.metrics import accuracy_score, precision_score
+
+ svc = SVC(kernel='sigmoid', gamma=1.0)
+ knc = KNeighborsClassifier()
+ mnb = MultinomialNB()
+ dtc = DecisionTreeClassifier(max_depth=5)
+ lrc = LogisticRegression(solver='liblinear', penalty='l1')
+ rfc = RandomForestClassifier(n_estimators=50, random_state=2)
+ abc = AdaBoostClassifier(n_estimators=50, random_state=2)
+ bc = BaggingClassifier(n_estimators=50, random_state=2)
+ etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
+ gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
+ xgb = XGBClassifier(n_estimators=50, random_state=2)
+
+ clfs = {
+     'SVC': svc,
+     'KN': knc,
+     'NB': mnb,
+     'DT': dtc,
+     'LR': lrc,
+     'RF': rfc,
+     'AdaBoost': abc,
+     'BgC': bc,
+     'ETC': etc,
+     'GBDT': gbdt,
+     'xgb': xgb
+ }
+
+ def train_classifier(clf, X_train, y_train, X_test, y_test):
+     # fixed: use the arguments instead of the global cv splits,
+     # so the same helper also works for the TF-IDF features
+     clf.fit(X_train, y_train)
+     y_pred = clf.predict(X_test)
+     accuracy = accuracy_score(y_test, y_pred)
+     precision = precision_score(y_test, y_pred, average='micro')
+
+     return accuracy, precision
+
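+ # Note: with single-label multiclass targets, micro-averaged precision equals
+ # accuracy, so the two metrics printed below should coincide.
+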
+ #######################################################
+ #********************CountVectorizer*****************#
+ #######################################################
+
+ np.unique(y_cv_test)
+
+ accuracy_scores = []
+ precision_scores = []
+
+ for name, clf in clfs.items():
+
+     current_accuracy, current_precision = train_classifier(clf, x_cv_train, y_cv_train, x_cv_test, y_cv_test)
+
+     print("For ", name)
+     print("Accuracy - ", current_accuracy)
+     print("Precision - ", current_precision)
+
+     accuracy_scores.append(current_accuracy)
+     precision_scores.append(current_precision)
+
+ performance_df = pd.DataFrame({'Algorithm': clfs.keys(), 'Accuracy': accuracy_scores, 'Precision': precision_scores}).sort_values('Precision', ascending=False)
+
+ performance_df
+
+ from sklearn.ensemble import VotingClassifier
+
+ svc = SVC(kernel='sigmoid', gamma=1.0, probability=True)
+ mnb = MultinomialNB()
+ etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
+
+ voting = VotingClassifier(estimators=[('svm', svc), ('nb', mnb), ('et', etc)], voting='soft')
+
+ voting.fit(x_cv_train, y_cv_train)
+
+ y_pred = voting.predict(x_cv_test)
+
+ print("Accuracy", accuracy_score(y_cv_test, y_pred))
+ print("Precision", precision_score(y_cv_test, y_pred, average='micro'))
+
+ import pickle
+
+ # Persist the fitted CountVectorizer and the logistic regression model;
+ # these are the two artifacts app.py loads at startup.
+ pickle.dump(cv, open('vectorizer.pkl', 'wb'))
+
+ pickle.dump(lrc, open('model.pkl', 'wb'))
+
+ gnb = GaussianNB()
+ mnb = MultinomialNB()
+ bnb = BernoulliNB()
+
+ gnb.fit(x_cv_train, y_cv_train)
+
+ pred_gnb = gnb.predict(x_cv_test)
+
+ print(accuracy_score(y_cv_test, pred_gnb))
+ print(precision_score(y_cv_test, pred_gnb, average='micro'))
+
+ mnb.fit(x_cv_train, y_cv_train)
+
+ pred_mnb = mnb.predict(x_cv_test)
+
+ print(accuracy_score(y_cv_test, pred_mnb))
+ print(precision_score(y_cv_test, pred_mnb, average='micro'))
+
+ bnb.fit(x_cv_train, y_cv_train)
+
+ pred_bnb = bnb.predict(x_cv_test)
+
+ print(accuracy_score(y_cv_test, pred_bnb))
+ print(precision_score(y_cv_test, pred_bnb, average='micro'))
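
Note on what actually ships: the script benchmarks eleven classifiers plus a soft-voting ensemble, but the artifacts pickled above are the CountVectorizer (cv) and the logistic regression model (lrc), and those are exactly what app.py loads. A minimal sketch to sanity-check the saved artifacts outside Flask (assuming both .pkl files are in the working directory; the sample comment is hypothetical):

import pickle

vectorizer = pickle.load(open('vectorizer.pkl', 'rb'))
model = pickle.load(open('model.pkl', 'rb'))

sample = "this video was really helpful"  # hypothetical raw input
features = vectorizer.transform([sample])  # app.py additionally lowercases, strips, and stems first
print(model.predict(features)[0])          # 0 = negative, 1 = neutral, 2 = positive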