Pranav-K commited on
Commit
08722d4
1 Parent(s): a31ae5b

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +826 -0
  2. gen-data.csv +0 -0
  3. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,826 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Survey_Analysis_v_3.2.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1UtAdINgLRkpdKGCzhEIPR8ZgK1u_dMtD
8
+ """
9
+
10
+ #1 - https://www.kaggle.com/code/ramjasmaurya/financial-sentiment-analysis
11
+ #2 - https://www.kaggle.com/code/adarshbiradar/sentiment-analysis-using-bert
12
+
13
# NOTE(review): bare `pip install ...` lines are Colab shell commands, not
# Python — they raise SyntaxError when this file runs as a script, so the
# module never even imports. Dependencies belong in requirements.txt; the
# original commands are kept below as comments for reference.
# pip install streamlit
# pip install pygal
# !pip install squarify

import streamlit
20
+
21
+ # Commented out IPython magic to ensure Python compatibility.
22
+ import numpy as np
23
+ import pandas as pd
24
+ import seaborn as sns
25
+ import matplotlib.pyplot as plt
26
+ import plotly.express as px
27
+ import plotly.graph_objects as go
28
+
29
+
30
+ import pygal as py
31
+ import squarify as sq
32
+ import matplotlib
33
+ plt.rcParams["figure.figsize"] = (20,15)
34
+ matplotlib.rc('xtick', labelsize=7)
35
+ matplotlib.rc('ytick', labelsize=7)
36
+
37
+ font = {'family' : 'normal',
38
+ 'weight' : 'bold',
39
+ 'size' : 5}
40
+
41
+ matplotlib.rc('font', **font)
42
+ from sklearn.feature_extraction.text import CountVectorizer
43
+ import warnings
44
+ warnings.filterwarnings("ignore", category=FutureWarning)
45
+ # %matplotlib inline
46
+
47
# Load the survey CSV. The file has no header row, so pandas parsed the first
# data row as the column names; recover those values and re-append them as a
# regular row, then give the columns their real names.
df = pd.read_csv("/content/gen-data.csv", engine="python", encoding="ISO-8859-1")

col1 = df.keys()[0]
col2 = df.keys()[1]

# The swallowed header row becomes one extra data row.
df2 = pd.DataFrame([[col1, col2]], columns=[col1, col2], index=[4845])

# BUG FIX: DataFrame.append and set_axis's `inplace=` argument were removed in
# pandas 2.0 — pd.concat is the supported equivalent with identical output.
df = pd.concat([df, df2], ignore_index=True).set_axis(['sentiment', 'news'], axis=1)

# NOTE(review): replacing "neutral" with "neutral" is a no-op; the original
# intent (normalising some variant spelling?) is unclear — confirm and fix.
df = df.replace("neutral", "neutral")
61
+
62
+ sns.countplot(y="sentiment",data=df)
63
+
64
+ df.isnull().sum()
65
+
66
+ from textblob import TextBlob
67
+
68
def preprocess(ReviewText):
    """Strip HTML artefacts from a pandas Series of news strings.

    Removes <br/> tags, anchor tags, bare HTML entity fragments
    (&amp/&gt/&lt) and non-breaking spaces.

    Parameters
    ----------
    ReviewText : pd.Series of str

    Returns
    -------
    pd.Series of cleaned strings.
    """
    # BUG FIX: these patterns are regexes. pandas 2.0 changed the default of
    # Series.str.replace to literal matching, which would silently stop every
    # one of these from matching — pass regex=True explicitly.
    ReviewText = ReviewText.str.replace(r"(<br/>)", "", regex=True)
    ReviewText = ReviewText.str.replace(r'(<a).*(>).*(</a>)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(&amp)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(&gt)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(&lt)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(\xa0)', ' ', regex=True)
    return ReviewText
76
+ df['Review Text'] = preprocess(df['news'])
77
+
78
+ df['polarity'] = df['news'].map(lambda text: TextBlob(text).sentiment.polarity)
79
+ df['news_len'] = df['news'].astype(str).apply(len)
80
+ df['word_count'] = df['news'].apply(lambda x: len(str(x).split()))
81
+
82
+ df
83
+
84
+ print('top 4 random reviews with the highest positive sentiment polarity: \n')
85
+
86
+ df1=df.drop_duplicates(subset=['Review Text'])
87
+
88
+ cl = df1.loc[df1.polarity == 1, ['Review Text']].sample(4).values
89
+ for c in cl:
90
+ print(c[0])
91
+
92
+ print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
93
+ cl1 = df.loc[df.polarity == 0, ['Review Text']].sample(5).values
94
+ for c in cl1:
95
+ print(c[0])
96
+
97
+ print('5 reviews with the most negative polarity having polarity lesser than -0.80: \n')
98
+ cl3 = df.loc[df.polarity <= -0.80, ['Review Text']].sample(5).values
99
+ for c in cl3:
100
+ print(c[0])
101
+
102
+ sns.boxplot(df["polarity"],palette="rainbow",data=df)
103
+
104
+ df['polarity'].plot(
105
+ kind='hist',
106
+ bins=50,
107
+ color="peru",
108
+ title='Sentiment Polarity Distribution');plt.show()
109
+
110
+ p_s=df[df["polarity"]>0].count()["sentiment"]
111
+ neu_s=df[df["polarity"]==0].count()["sentiment"]
112
+ neg_s=df[df["polarity"]<0].count()["sentiment"]
113
+
114
+ # Setting labels for items in Chart
115
+ sentiment = ['positive_sentiment',"neutral_sentiment","negative_sentiment"]
116
+
117
+ # Setting size in Chart based on
118
+ # given values
119
+ values = [p_s,neu_s,neg_s]
120
+
121
+ # colors
122
+ colors = ['#FF0000', 'olive', '#FFFF00']
123
+ # explosion
124
+ explode = (0.05, 0.05, 0.05)
125
+
126
+ # Pie Chart
127
+ plt.pie(values, colors=colors, labels=sentiment,
128
+ autopct='%1.1f%%', pctdistance=0.85,
129
+ explode=explode)
130
+
131
+ # draw circle
132
+ centre_circle = plt.Circle((0, 0), 0.70, fc='white')
133
+ fig = plt.gcf()
134
+
135
+ # Adding Circle in Pie chart
136
+ fig.gca().add_artist(centre_circle)
137
+
138
+ # Adding Title of chart
139
+ plt.title('count of polarity as per sentiment')
140
+
141
+ # Displaing Chart
142
+ plt.show()
143
+
144
+ df.plot.box(y=["word_count"],color="hotpink")
145
+
146
+ df['word_count'].plot(
147
+ kind='hist',
148
+ bins=100,
149
+ color="orange",
150
+ title='Review Text Word Count Distribution');plt.show()
151
+
152
+ sns.boxenplot(x="news_len",data=df)
153
+ plt.show()
154
+
155
+ df['news_len'].plot(
156
+ kind='hist',
157
+ bins=50,
158
+ color="lightblue",
159
+ title='Review Text Word Count Distribution');plt.show()
160
+
161
+ fig = px.scatter(df, x="news_len", y="word_count", color="sentiment",
162
+ marginal_x="box", marginal_y="violin",
163
+ title="Click on the legend items!")
164
+ fig.show()
165
+
166
def get_top_n_words(corpus, n=None):
    """Return the n most frequent unigrams in *corpus* as (word, count) pairs."""
    vectorizer = CountVectorizer().fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    freq_pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return freq_pairs[:n]
173
+ common_words = get_top_n_words(df['Review Text'], 20)
174
+ for word, freq in common_words:
175
+ print(word, freq)
176
+ df1 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
177
+ df1.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
178
+ kind='bar',title='Top 20 words in review before removing stop words')
179
+ df1
180
+
181
def get_top_n_words(corpus, n=None):
    """Return the n most frequent unigrams in *corpus*, English stop words excluded."""
    vectorizer = CountVectorizer(stop_words='english').fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    freq_pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return freq_pairs[:n]
188
+ common_words = get_top_n_words(df['Review Text'], 20)
189
+ for word, freq in common_words:
190
+ print(word, freq)
191
+ df2 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
192
+ df2.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(kind='bar', title='Top 20 words in review after removing stop words')
193
+
194
def get_top_n_bigram(corpus, n=None):
    """Return the n most frequent bigrams in *corpus* as (bigram, count) pairs."""
    vectorizer = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    freq_pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return freq_pairs[:n]
201
+ common_words = get_top_n_bigram(df['Review Text'], 20)
202
+ for word, freq in common_words:
203
+ print(word, freq)
204
+ df3 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
205
+ df3.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
206
+ kind='bar',title='Top 20 bigrams in review before removing stop words')
207
+
208
def get_top_n_bigram(corpus, n=None):
    """Return the n most frequent bigrams in *corpus*, English stop words excluded."""
    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    freq_pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return freq_pairs[:n]
215
+ common_words = get_top_n_bigram(df['Review Text'], 20)
216
+ for word, freq in common_words:
217
+ print(word, freq)
218
+ df4 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
219
+ df4.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
220
+ kind='bar', title='Top 20 bigrams in review after removing stop words')
221
+
222
def get_top_n_trigram(corpus, n=None):
    """Return the n most frequent trigrams in *corpus* as (trigram, count) pairs."""
    vectorizer = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    freq_pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return freq_pairs[:n]
229
+ common_words = get_top_n_trigram(df['Review Text'], 20)
230
+ for word, freq in common_words:
231
+ print(word, freq)
232
+ df5 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
233
+ df5.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
234
+ kind='bar', title='Top 20 trigrams in review before removing stop words')
235
+
236
def get_top_n_trigram(corpus, n=None):
    """Return the n most frequent trigrams in *corpus*, English stop words excluded."""
    vectorizer = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    freq_pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return freq_pairs[:n]
243
+ common_words = get_top_n_trigram(df['Review Text'], 20)
244
+ for word, freq in common_words:
245
+ print(word, freq)
246
+ df6 = pd.DataFrame(common_words, columns = ['ReviewText' ,'count'])
247
+ df6.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
248
+ kind='bar', title='Top 20 trigrams in review after removing stop words')
249
+
250
+ import nltk
251
+ nltk.download('punkt')
252
+ nltk.download('wordnet')
253
+ nltk.download('omw-1.4')
254
+ nltk.download('averaged_perceptron_tagger')
255
+
256
+ #import nltk
257
+ blob = TextBlob(str(df['Review Text']))
258
+ pos_df = pd.DataFrame(blob.tags, columns = ['word' , 'pos'])
259
+ pos_df = pos_df.pos.value_counts()[:20]
260
+ pos_df.plot(
261
+ kind='bar',
262
+ title='Top 20 Part-of-speech tagging for review corpus')
263
+
264
+ y0 = df.loc[df['sentiment'] == 'positive']['polarity']
265
+ y1 = df.loc[df['sentiment'] == 'negative']['polarity']
266
+ y2 = df.loc[df['sentiment'] == 'neutral']['polarity']
267
+
268
+ trace0 = go.Box(
269
+ y=y0,
270
+ name = 'positive',
271
+ marker = dict(
272
+ color = 'rgb(214, 12, 140)',
273
+ )
274
+ )
275
+ trace1 = go.Box(
276
+ y=y1,
277
+ name = 'negative',
278
+ marker = dict(
279
+ color = 'rgb(0, 128, 128)',
280
+ )
281
+ )
282
+ trace2 = go.Box(
283
+ y=y2,
284
+ name = 'neutral',
285
+ marker = dict(
286
+ color = 'rgb(10, 140, 208)',
287
+ )
288
+ )
289
+ data = [trace0, trace1, trace2]
290
+ layout = go.Layout(
291
+ title = "Polarity Boxplot according to sentiment"
292
+ )
293
+
294
+ go.Figure(data=data,layout=layout)
295
+
296
+ y0 = df.loc[df['sentiment'] == 'positive']['news_len']
297
+ y1 = df.loc[df['sentiment'] == 'negative']['news_len']
298
+ y2 = df.loc[df['sentiment'] == 'neutral']['news_len']
299
+
300
+
301
+ trace0 = go.Box(
302
+ y=y0,
303
+ name = 'positive',
304
+ marker = dict(
305
+ color = 'rgb(214, 12, 140)',
306
+ )
307
+ )
308
+ trace1 = go.Box(
309
+ y=y1,
310
+ name = 'negative',
311
+ marker = dict(
312
+ color = 'rgb(0, 128, 128)',
313
+ )
314
+ )
315
+ trace2 = go.Box(
316
+ y=y2,
317
+ name = 'neutral',
318
+ marker = dict(
319
+ color = 'rgb(10, 140, 208)',
320
+ )
321
+ )
322
+ data = [trace0, trace1, trace2]
323
+ layout = go.Layout(
324
+ title = "news length Boxplot by sentiment"
325
+ )
326
+ go.Figure(data=data,layout=layout)
327
+
328
+ xp = df.loc[df['sentiment'] == "positive", 'polarity']
329
+ xneu = df.loc[df['sentiment'] == "neutral", 'polarity']
330
+ xneg= df.loc[df['sentiment'] == "negative", 'polarity']
331
+
332
+ trace1 = go.Histogram(
333
+ x=xp, name='positive',
334
+ opacity=0.75
335
+ )
336
+ trace2 = go.Histogram(
337
+ x=xneu, name = 'neutral',
338
+ opacity=0.75
339
+ )
340
+ trace3 = go.Histogram(
341
+ x=xneg, name = 'negative',
342
+ opacity=0.75
343
+ )
344
+ data = [trace1, trace2,trace3]
345
+ layout = go.Layout(barmode='overlay', title='Distribution of Sentiment polarity')
346
+ go.Figure(data=data, layout=layout)
347
+
348
+ trace1 = go.Scatter(
349
+ x=df['polarity'], y=df['news_len'], mode='markers', name='points',
350
+ marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4)
351
+ )
352
+ trace2 = go.Histogram2dContour(
353
+ x=df['polarity'], y=df['news_len'], name='density', ncontours=50,
354
+ colorscale='Hot', reversescale=True, showscale=False
355
+ )
356
+ trace3 = go.Histogram(
357
+ x=df['polarity'], name='Sentiment polarity density',
358
+ marker=dict(color='rgb(102,0,0)'),
359
+ yaxis='y2'
360
+ )
361
+ trace4 = go.Histogram(
362
+ y=df['news_len'], name='news length density', marker=dict(color='rgb(102,0,0)'),
363
+ xaxis='x2'
364
+ )
365
+ data = [trace1, trace2, trace3, trace4]
366
+
367
+ layout = go.Layout(
368
+ showlegend=False,
369
+ autosize=False,
370
+ width=600,
371
+ height=550,
372
+ xaxis=dict(
373
+ domain=[0, 0.85],
374
+ showgrid=False,
375
+ zeroline=False
376
+ ),
377
+ yaxis=dict(
378
+ domain=[0, 0.85],
379
+ showgrid=False,
380
+ zeroline=False
381
+ ),
382
+ margin=dict(
383
+ t=50
384
+ ),
385
+ hovermode='x unified',
386
+ bargap=0,
387
+ xaxis2=dict(
388
+ domain=[0.85, 1],
389
+ showgrid=False,
390
+ zeroline=False
391
+ ),
392
+ yaxis2=dict(
393
+ domain=[0.85, 1],
394
+ showgrid=False,
395
+ zeroline=False
396
+ )
397
+ )
398
+
399
+ go.Figure(data=data, layout=layout)
400
+
401
+ trace1 = go.Scatter(
402
+ x=df['polarity'], y=df['word_count'], mode='markers', name='points',
403
+ marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4)
404
+ )
405
+ trace2 = go.Histogram2dContour(
406
+ x=df['polarity'], y=df['word_count'], name='density', ncontours=20,
407
+ colorscale='Hot', reversescale=True, showscale=False
408
+ )
409
+ trace3 = go.Histogram(
410
+ x=df['polarity'], name='Sentiment polarity density',
411
+ marker=dict(color='rgb(102,0,0)'),
412
+ yaxis='y2'
413
+ )
414
+ trace4 = go.Histogram(
415
+ y=df['word_count'], name='word count density', marker=dict(color='rgb(112,0,0)'),
416
+ xaxis='x2'
417
+ )
418
+ data = [trace1, trace2, trace3, trace4]
419
+
420
+ layout = go.Layout(
421
+ showlegend=False,
422
+ autosize=False,
423
+ width=600,
424
+ height=550,
425
+ xaxis=dict(
426
+ domain=[0, 0.85],
427
+ showgrid=False,
428
+ zeroline=False
429
+ ),
430
+ yaxis=dict(
431
+ domain=[0, 0.85],
432
+ showgrid=False,
433
+ zeroline=False
434
+ ),
435
+ margin=dict(
436
+ t=50
437
+ ),
438
+ hovermode='closest',
439
+ bargap=0,
440
+ xaxis2=dict(
441
+ domain=[0.85, 1],
442
+ showgrid=False,
443
+ zeroline=False
444
+ ),
445
+ yaxis2=dict(
446
+ domain=[0.85, 1],
447
+ showgrid=False,
448
+ zeroline=False
449
+ )
450
+ )
451
+
452
+ go.Figure(data=data, layout=layout)
453
+
454
# NOTE(review): bare `pip install ...` is shell syntax, not Python — it is a
# SyntaxError outside a notebook cell. Install via requirements.txt instead.
# pip install scattertext
# pip install spacy
457
+
458
+ import scattertext as st
459
+ import spacy
460
+ nlp = spacy.blank("en")
461
+ nlp.add_pipe('sentencizer')
462
+ #nlp.add_pipe(nlp.create_pipe('sentencizer'))
463
+ corpus = st.CorpusFromPandas(df, category_col='sentiment', text_col='Review Text', nlp=nlp).build()
464
+ print(list(corpus.get_scaled_f_scores_vs_background().index[:20]))
465
+
466
+ term_freq_df = corpus.get_term_freq_df()
467
+ term_freq_df['positive_sentiment'] = corpus.get_scaled_f_scores('positive')
468
+ list(term_freq_df.sort_values(by='positive_sentiment', ascending=False).index[:20])
469
+
470
+ term_freq_df['neutral_sentiment'] = corpus.get_scaled_f_scores('neutral')
471
+ list(term_freq_df.sort_values(by='neutral_sentiment', ascending=False).index[:20])
472
+
473
+ term_freq_df['negative_sentiment'] = corpus.get_scaled_f_scores('negative')
474
+ list(term_freq_df.sort_values(by='negative_sentiment', ascending=False).index[:20])
475
+
476
+ from sklearn.feature_extraction.text import TfidfVectorizer
477
+ from sklearn.decomposition import TruncatedSVD
478
+ from collections import Counter
479
+
480
+ tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)
481
+ reindexed_data = df['Review Text'].values
482
+ document_term_matrix = tfidf_vectorizer.fit_transform(reindexed_data)
483
+ n_topics = 10
484
+ lsa_model = TruncatedSVD(n_components=n_topics)
485
+ lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)
486
+
487
def get_keys(topic_matrix):
    """Map each row of *topic_matrix* to the index of its dominant topic.

    Returns a plain list with one integer topic id per document.
    """
    return topic_matrix.argmax(axis=1).tolist()
494
+
495
def keys_to_counts(keys):
    """Tally topic assignments.

    Returns a tuple (categories, counts): the distinct topic ids in
    first-seen order and how many documents were assigned to each.
    """
    tally = Counter(keys)
    categories = list(tally.keys())
    counts = list(tally.values())
    return (categories, counts)
504
+
505
+ lsa_keys = get_keys(lsa_topic_matrix)
506
+ lsa_categories, lsa_counts = keys_to_counts(lsa_keys)
507
+
508
def get_top_n_words(n, keys, document_term_matrix, tfidf_vectorizer):
    """Return one string per topic holding its *n* highest-weight words.

    Parameters
    ----------
    n : int — number of words per topic.
    keys : list[int] — predicted topic id for each document (see get_keys).
    document_term_matrix : sparse tf-idf matrix, one row per document.
    tfidf_vectorizer : the fitted vectorizer, used to map columns to words.

    NOTE(review): iterates over the module-level global `n_topics` rather
    than a parameter — confirm it is always defined before this is called.
    """
    top_word_indices = []
    for topic in range(n_topics):
        # Sum the tf-idf rows of every document assigned to this topic.
        temp_vector_sum = 0
        for i in range(len(keys)):
            if keys[i] == topic:
                temp_vector_sum += document_term_matrix[i]
        if isinstance(temp_vector_sum, int):
            # BUG FIX: the original crashed here (int has no .toarray()) when
            # a topic had no documents at all; use an all-zero row instead.
            temp_vector_sum = np.zeros((1, document_term_matrix.shape[1]))
        else:
            temp_vector_sum = temp_vector_sum.toarray()
        top_n_word_indices = np.flip(np.argsort(temp_vector_sum)[0][-n:], 0)
        top_word_indices.append(top_n_word_indices)
    top_words = []
    for topic in top_word_indices:
        topic_words = []
        for index in topic:
            # Recover the vocabulary term for column `index` by feeding a
            # one-hot row back through the vectorizer.
            temp_word_vector = np.zeros((1, document_term_matrix.shape[1]))
            temp_word_vector[:, index] = 1
            the_word = tfidf_vectorizer.inverse_transform(temp_word_vector)[0][0]
            # NOTE(review): .encode('ascii') raises on non-ASCII vocabulary —
            # presumably the corpus is ASCII-only; confirm.
            topic_words.append(the_word.encode('ascii').decode('utf-8'))
        top_words.append(" ".join(topic_words))
    return top_words
532
+
533
+ top_lsa=get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
534
+
535
+ for i in range(len(top_lsa)):
536
+ print("Topic {}: ".format(i+1), top_lsa[i])
537
+
538
+ top_3_words = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
539
+ labels = ['Topic {}: \n'.format(i+1) + top_3_words[i] for i in lsa_categories]
540
+ fig, ax = plt.subplots(figsize=(16,8))
541
+ ax.bar(lsa_categories, lsa_counts,color="skyblue");
542
+ ax.set_xticks(lsa_categories,);
543
+ ax.set_xticklabels(labels, rotation=45, rotation_mode='default',color="olive");
544
+ ax.set_ylabel('Number of review text on topics');
545
+ ax.set_title('Count of LSA topics');
546
+ plt.show();
547
+
548
+ """#---2----"""
549
+
550
+ df['sentiment'].value_counts()
551
+
552
+ from sklearn.model_selection import train_test_split
553
+ train,eva = train_test_split(df,test_size = 0.2)
554
+
555
# !pip install simpletransformers  # notebook magic — not valid in a .py script
556
+
557
+ from simpletransformers.classification import ClassificationModel
558
+
559
+ # Create a Transformer Model BERT
560
+ model = ClassificationModel('bert', 'bert-base-cased', num_labels=3, args={'reprocess_input_data': True, 'overwrite_output_dir': True},use_cuda=False)
561
+
562
# Label encoding used for training: positive -> 0, negative -> 1, neutral -> 2.
def making_label(st):
    """Encode a sentiment string as the integer class id the model expects.

    Any string other than 'positive' or 'neutral' is treated as negative.
    """
    if st == 'positive':
        return 0
    if st == 'neutral':
        return 2
    return 1
570
+
571
+ train['label'] = train['sentiment'].apply(making_label)
572
+ eva['label'] = eva['sentiment'].apply(making_label)
573
+ print(train.shape)
574
+
575
+ train_df = pd.DataFrame({
576
+ 'text': train['news'][:1500].replace(r'\n', ' ', regex=True),
577
+ 'label': train['label'][:1500]
578
+ })
579
+
580
+ eval_df = pd.DataFrame({
581
+ 'text': eva['news'][-400:].replace(r'\n', ' ', regex=True),
582
+ 'label': eva['label'][-400:]
583
+ })
584
+
585
+ model.train_model(train_df)
586
+
587
+ result, model_outputs, wrong_predictions = model.eval_model(eval_df)
588
+
589
+ result
590
+
591
+ model_outputs
592
+
593
+ len(wrong_predictions)
594
+
595
+ lst = []
596
+ for arr in model_outputs:
597
+ lst.append(np.argmax(arr))
598
+
599
+ true = eval_df['label'].tolist()
600
+ predicted = lst
601
+
602
+ import sklearn
603
+ mat = sklearn.metrics.confusion_matrix(true , predicted)
604
+ mat
605
+
606
+ df_cm = pd.DataFrame(mat, range(3), range(3))
607
+
608
+ sns.heatmap(df_cm, annot=True)
609
+ plt.show()
610
+
611
+ print(sklearn.metrics.classification_report(true,predicted,target_names=['positive','neutral','negative']))
612
+
613
+ sklearn.metrics.accuracy_score(true,predicted)
614
+
615
+ #Give your statement
616
def get_result(statement):
    """Classify *statement* with the trained BERT model and print the label.

    Returns the predicted sentiment string ('positive'/'negative'/'neutral').
    The original returned None, so returning the label is purely additive
    and backward compatible for existing callers.
    """
    result = model.predict([statement])
    # result[1][0] holds the raw per-class outputs; take the argmax index.
    pos = np.where(result[1][0] == np.amax(result[1][0]))
    pos = int(pos[0])
    # Must be the inverse of making_label(): positive->0, negative->1, neutral->2.
    sentiment_dict = {0: 'positive', 1: 'negative', 2: 'neutral'}
    print(sentiment_dict[pos])
    return sentiment_dict[pos]
623
+
624
+ ## neutral statement
625
+ get_result("According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .")
626
+
627
+ ## positive statement
628
+ get_result("According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .")
629
+
630
+ ## negative statement
631
+ get_result('Sales in Finland decreased by 2.0 % , and international sales decreased by 9.3 % in terms of euros , and by 15.1 % in terms of local currencies .')
632
+
633
+ get_result("This company is growing like anything with 23% profit every year")
634
+
635
+ get_result("This company is not able to make any profit but make very less profit in last quarter")
636
+
637
+ get_result("The doctor treated well and the patient was very healthy")
638
+
639
+ get_result("the act of politicians is to serve and help needy and not to create ruck suck")
640
+
641
+ get_result("American burger is too good. Can't resisit to go and have one")
642
+
643
+ get_result("GDP per capita increased to double in India from 2013")
644
+
645
+ get_result("Indian economy is doing very good and will become super power one day.")
646
+
647
+ get_result("Indian economy is doing very good and will create millions of jobs in coming years")
648
+
649
+ get_result("Indian economy is not doing very good and need urgent reforms but we are pretty sure it will be very good in coming years")
650
+
651
+ get_result("Indian economy is doing very good.Indian economy is not doing very good ")
652
+
653
+ get_result("Indian economy is not doing very good. Indian economy will bounce back to become leading economy")
654
+
655
+ get_result("Indian economy is not doing very good. Urgent reforms is required to create new jobs and improve export")
656
+
657
+ get_result("The stock market of Indian economy is dangling too much")
658
+
659
+ """#VADER"""
660
+
661
# !pip install vaderSentiment  # notebook magic — not valid in a .py script
662
+
663
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
664
+
665
+ obj = SentimentIntensityAnalyzer()
666
+
667
+ sentence = "Ram is really good "
668
+ sentiment_dict = obj.polarity_scores(sentence)
669
+ print(sentiment_dict)
670
+
671
+ #check this
672
+ sentence = "Ram is better "
673
+ sentiment_dict = obj.polarity_scores(sentence)
674
+ print(sentiment_dict)
675
+
676
+ sentence = "Rahul is really bad"
677
+ sentiment_dict = obj.polarity_scores(sentence)
678
+ print(sentiment_dict)
679
+
680
+ #punctuation
681
+ print(obj.polarity_scores('Ram is good boy'))
682
+ print(obj.polarity_scores('Ram is good boy!'))
683
+ print(obj.polarity_scores('Ram is good boy!!'))
684
+
685
+ #capitalization
686
+ print(obj.polarity_scores('Ram is good'))
687
+ print(obj.polarity_scores('Ram is GOOD'))
688
+
689
+ #degree
690
+ print(obj.polarity_scores('Ram is good'))
691
+ print(obj.polarity_scores('Ram is better'))
692
+ print(obj.polarity_scores('Ram is best'))
693
+
694
+ print(obj.polarity_scores('Ram is bad'))
695
+ print(obj.polarity_scores('Ram is worse'))
696
+ print(obj.polarity_scores('Ram is worst'))
697
+
698
+ #conjuction
699
+ print(obj.polarity_scores('Ram is good'))
700
+ print(obj.polarity_scores('Ram is good, but he is also naughty sometimes'))
701
+
702
+ #slang
703
+ print(obj.polarity_scores("That Hotel"))
704
+ print(obj.polarity_scores("That Hotel SUX"))
705
+ print(obj.polarity_scores("That Hotel SUCKS"))
706
+
707
+ #emoticons
708
+ print(obj.polarity_scores("Your :) is the most beautiful thing I have ever seen"))
709
+ print(obj.polarity_scores("Your smile is the most beautiful thing I have ever seen"))
710
+
711
+ print(obj.polarity_scores("Your :( is the worst thing I have ever seen"))
712
+ print(obj.polarity_scores("Your smile is the worst thing I have ever seen"))
713
+
714
+ #https://360digitmg.com/blog/bert-variants-and-their-differences
715
+ #https://simpletransformers.ai/docs/classification-specifics/#supported-model-types Official reference
716
+
717
+ """#3.a Using FINBERT Model"""
718
+
719
+ #PPT
720
+ #https://medium.com/@benjamin_joesy/finbert-financial-sentiment-analysis-with-bert-acf695b64ac6
721
+
722
+ from transformers import BertTokenizer, BertForSequenceClassification, pipeline
723
+
724
+ # tested in transformers==4.18.0
725
+ import transformers
726
+ transformers.__version__
727
+
728
+ finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
729
+ tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
730
+
731
+ nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
732
+ results = nlp(['growth is strong and we have plenty of liquidity.',
733
+ 'there is a shortage of capital, and we need extra financing.',
734
+ 'formulation patents might protect Vasotec to a limited extent.'])
735
+
736
+ results
737
+
738
+ """#FINBERT ESG"""
739
+
740
+ finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg',num_labels=4)
741
+ tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
742
+
743
+ nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
744
+ results = nlp(['Managing and working to mitigate the impact our operations have on the environment is a core element of our business.',
745
+ 'Rhonda has been volunteering for several years for a variety of charitable community programs.',
746
+ 'Cabot\'s annual statements are audited annually by an independent registered public accounting firm.',
747
+ 'As of December 31, 2012, the 2011 Term Loan had a principal balance of $492.5 million.'])
748
+
749
+ results
750
+
751
+ """#FINBERT Classification"""
752
+
753
+ finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls',num_labels=3)
754
+ tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')
755
+
756
+ nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
757
+ results = nlp(['we expect the age of our fleet to enhance availability and reliability due to reduced downtime for repairs.',
758
+ 'on an equivalent unit of production basis, general and administrative expenses declined 24 percent from 1994 to $.67 per boe.',
759
+ 'we will continue to assess the need for a valuation allowance against deferred tax assets considering all available evidence obtained in'])
760
+
761
+ results
762
+
763
+ X = df['Review Text'].to_list()
764
+ y = df['sentiment'].to_list()
765
+
766
+ from transformers import BertTokenizer, BertForSequenceClassification
767
+
768
+ finbert_whole = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
769
+ tokenizer_whole = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
770
+
771
+ labels = {0:'neutral', 1:'positive',2:'negative'}
772
+
773
+ sent_val = list()
774
+ for x in X:
775
+ inputs = tokenizer_whole(x, return_tensors="pt", padding=True)
776
+ outputs = finbert_whole(**inputs)[0]
777
+
778
+ val = labels[np.argmax(outputs.detach().numpy())]
779
+ print(x, '---->', val)
780
+ print('#######################################################')
781
+ sent_val.append(val)
782
+
783
+ from sklearn.metrics import accuracy_score
784
+ print(accuracy_score(y, sent_val))
785
+
786
+ """#Using DISTILBERT"""
787
+
788
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer_distilbert = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model_distilbert = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# NOTE(review): the base distilbert checkpoint ships an untrained
# classification head, so mapping its outputs to these three sentiment
# labels is essentially arbitrary — confirm a fine-tuned sentiment
# checkpoint was intended here.
labels = {0: 'neutral', 1: 'positive', 2: 'negative'}

sent_val_bert = list()
for x in X:
    inputs = tokenizer_distilbert(x, return_tensors="pt", padding=True)
    outputs = model_distilbert(**inputs)[0]

    val = labels[np.argmax(outputs.detach().numpy())]
    print(x, '---->', val)
    print('#######################################################')
    sent_val_bert.append(val)

from sklearn.metrics import accuracy_score
# BUG FIX: the original scored `sent_val` (the FinBERT predictions) here,
# re-reporting FinBERT's accuracy instead of DistilBERT's.
print(accuracy_score(y, sent_val_bert))
807
+
808
+ """#Bert"""
809
+
810
# BUG FIX: bert-base-uncased is a BERT checkpoint, so it must be loaded with
# the BERT classes (imported earlier in this file), not the DistilBert ones —
# DistilBertForSequenceClassification cannot load BERT weights correctly.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
model_bert = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# NOTE(review): as with the DistilBERT cell, the base checkpoint's
# classification head is untrained — confirm which fine-tuned model was meant.
labels = {0: 'neutral', 1: 'positive', 2: 'negative'}

sent_val_bert1 = list()
for x in X:
    inputs = tokenizer_bert(x, return_tensors="pt", padding=True)
    outputs = model_bert(**inputs)[0]

    val = labels[np.argmax(outputs.detach().numpy())]
    print(x, '---->', val)
    print('#######################################################')
    sent_val_bert1.append(val)

from sklearn.metrics import accuracy_score
# BUG FIX: score this section's own predictions (`sent_val_bert1`), not the
# FinBERT predictions (`sent_val`) as the original did.
print(accuracy_score(y, sent_val_bert1))
gen-data.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
Binary file (8.87 kB). View file