greco committed
Commit 0a6fa53 · Parent: 3cb656a

Revert "update codes"


This reverts commit 52198899b03ef3c76baa12146db06d229a505b9e.

app.py CHANGED
@@ -18,8 +18,6 @@ from scipy.stats import zscore
 
 # nlp
 from bertopic import BERTopic
-from transformers import pipeline
-import transformers
 
 # custom
 import survey_analytics_library as LIB
@@ -63,14 +61,6 @@ def read_topic_results():
     return topic_results
 topic_results = read_topic_results()
 
-@st.cache
-def read_climate_change_results():
-    sentiment_results = pd.read_csv(data_path+'sentiment_results.csv')
-    zero_shot_results = pd.read_csv(data_path+'zero_shot_results.csv')
-    return sentiment_results, zero_shot_results
-sentiment_results, zero_shot_results = read_climate_change_results()
-
-
 # write title of app
 st.title('DACoP - Survey Analytics')
 st.markdown('''---''')
@@ -376,184 +366,9 @@ st.markdown('''---''')
 
 
 st.header('Classifying Text Responses and Sentiment Analysis')
-st.write(f'''
+st.write('''
 With survey responses, sometimes as a business user, we already have a general idea of what responders are talking about and we want to categorise or classify the responses accordingly.
-As an example, within the topic of 'Climate Change', we are interested in finance, politics, technology, and wildlife.
-Using **Zero-shot Classification**, we can classify responses into one of these four categories.
-As an added bonus, we can also find out how responders feel about the categories using **Sentiment Analysis**.
-We'll use a different set of 10,000 tweets related to climate change.
-''')
-st.write('\n')
-
-# rename column
-sentiment_results = sentiment_results.rename(columns={'sequence':'Tweet'})
-st.dataframe(sentiment_results[['Tweet']])
-
-@st.cache(allow_output_mutation=True)
-def load_transfomer_pipelines():
-    classifier_zero_shot = pipeline(
-        task='zero-shot-classification',
-        model=model_path+'distilbart-mnli-12-1',
-        return_all_scores=True
-    )
-    classifier_sentiment = pipeline(
-        task='sentiment-analysis',
-        model=model_path+'distilbert-base-uncased-finetuned-sst-2-english',
-        return_all_scores=True
-    )
-    return classifier_zero_shot, classifier_sentiment
-classifier_zero_shot, classifier_sentiment = load_transfomer_pipelines()
-
-# define candidate labels
-candidate_labels = [
-    'finance',
-    'politics',
-    'technology',
-    'wildlife',
-]
-
-# define sample tweet
-sample_tweet_index = 5000
-
-# define the first and last tweet index
-# create range of index
-tweet_index = sentiment_results.index
-first_tweet = tweet_index[0]
-last_tweet = tweet_index[-1]
+E.g.
 
-st.write(f'''
-As a demonstration, we'll define some categories and pick a tweet to classify and determine its sentiment.
-Feel free to add your own categories or even input your own text!
 ''')
-
-# interactive input for user to define candidate labels and tweet index for analysis
-with st.form('classify_tweets'):
-    # input for labels
-    user_defined_labels = st.text_input('Enter categories (separate categories by comma):', ', '.join(candidate_labels))
-    candidate_labels = user_defined_labels
-    # input for tweet index
-    user_define_tweet = st.number_input(f'Enter tweet index (from {first_tweet} to {last_tweet}) to classify:', min_value=first_tweet, max_value=last_tweet, value=sample_tweet_index)
-    sample_tweet_index = user_define_tweet
-    sample_tweet = sentiment_results['Tweet'].iloc[sample_tweet_index]
-    # input for user defined text
-    user_defined_input = st.text_input('Enter custom text (optional, leave blank to use Tweets):', '')
-    # check if user has entered any custom text
-    # if user_defined_input is not blank, then override sample_tweet
-    if user_defined_input:
-        sample_tweet = user_defined_input
-
-    # submit form
-    submit = st.form_submit_button('Classify Tweet')
-
 st.write('\n')
-st.write(f'''
-Here are the results:
-''')
-st.write(f'Input Text: *\'{sample_tweet}\'*')
-
-# get predictions from models
-zero_shot_sample = classifier_zero_shot(sample_tweet, candidate_labels)
-sentiment_sample = classifier_sentiment(sample_tweet)
-
-# get sentiment
-sentiment_sample = sentiment_sample[1].get('score')
-sentiment_label = 'positive'
-if sentiment_sample < 0.5:
-    sentiment_label = 'negative'
-
-st.write(f'''
-The main category is: **{zero_shot_sample['labels'][0]}** with a score of {round(zero_shot_sample['scores'][0], 2)}
-Main category score ranges from 0 to 1, with 1 being very likely.
-
-The full set of scores is: {dict(zip(zero_shot_sample['labels'], [round(score, 2) for score in zero_shot_sample['scores']]))}
-The full set of scores adds up to 1.
-
-The sentiment is: **{sentiment_label}** with a score of {round(sentiment_sample, 2)}
-Sentiment score ranges from 0 to 1, with 1 being very positive.
-''')
-st.write('\n')
-st.write('\n')
-
-# drop unused columns and rename columns
-zero_shot_results = zero_shot_results.drop('labels_scores', axis=1)
-zero_shot_results = zero_shot_results.rename(columns={'sequence':'tweet', 'label':'category'})
-st.write(f'''
-Let's review all the tweets and how they fall into the categories of finance, politics, technology, and wildlife.
-''')
-
-st.dataframe(zero_shot_results)
-
-st.write(f'''
-We can observe that the model does not have strong confidence in predicting the categories for some of the tweets.
-It is likely that the tweet does not naturally fall into one of the defined categories.
-Before performing further analysis on our results, we can set a score threshold to only keep predictions that we're confident in.
-''')
-st.write('\n')
-
-# interactive input for user to define the classification score threshold
-with st.form('classification_score_threshold'):
-    user_defined_threshold = st.number_input('Enter score threshold (between 0.01 and 0.99):', min_value=0.01, max_value=0.99, value=0.7, step=0.05)
-    # submit form
-    submit = st.form_submit_button('Set Threshold')
-st.write('\n')
-
-# filter and keep results with score above defined threshold
-zero_shot_results_clean = zero_shot_results.loc[(zero_shot_results['score'] >= user_defined_threshold)].copy()
-
-# rename columns
-sentiment_results.columns = ['tweet', 'sentiment']
-
-st.write(f'''
-The predictions get better with a higher threshold, but this reduces the final number of tweets available for further analysis.
-Out of the 10,000 tweets, we are now left with {len(zero_shot_results_clean)}.
-We also add on the sentiment score for the tweets; the score here ranges from 0 (most negative) to 1 (most positive).
-''')
-
-# merge in sentiment score on index
-# drop unused columns
-classification_sentiment_df = pd.merge(zero_shot_results_clean, sentiment_results[['sentiment']], how='left', left_index=True, right_index=True)
-classification_sentiment_df = classification_sentiment_df[['tweet', 'category', 'score', 'sentiment']]
-st.dataframe(classification_sentiment_df)
-
-st.write(f'''
-The difficult part of zero-shot classification is defining the right set of categories for each business case.
-Some trial and error is required to find the appropriate words that can return the optimal results.
-''')
-st.write('\n')
-
-# group by category, count tweets and get mean of sentiment
-classification_sentiment_agg = classification_sentiment_df.groupby(['category']).agg({'tweet':'count', 'sentiment':'mean'}).reset_index()
-classification_sentiment_agg = classification_sentiment_agg.rename(columns={'tweet':'count'})
-
-st.write(f'''
-Finally, we can visualise the percentage of tweets in each category and the respective average sentiment scores.
-''')
-
-fig = px.pie(
-    classification_sentiment_agg,
-    values='count',
-    names='category',
-    hole=0.35,
-    title='Percentage of Tweets in Each Category',
-    template='simple_white',
-    width=1000,
-    height=600
-)
-fig.update_traces(textposition='inside', textinfo='percent+label')
-st.plotly_chart(fig)
-
-fig = px.bar(
-    classification_sentiment_agg,
-    x='category',
-    y='sentiment',
-    title='Average Sentiment of Tweets in Each Category <br><sup>Overall, the sentiment of the tweets is on the negative side.</sup>',
-    template='simple_white',
-    width=1000,
-    height=600
-)
-fig.update_yaxes(range=[0, 1])
-fig.add_hline(y=0.5, line_width=3, line_color='darkgreen')
-st.plotly_chart(fig)
-
-st.write('\n')
-st.markdown('''---''')
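Note: the block removed above wired two Hugging Face pipelines into the app. A minimal sketch of that flow, assuming the public Hub checkpoints valhalla/distilbart-mnli-12-1 and distilbert-base-uncased-finetuned-sst-2-english stand in for the local model_path folders the app loaded, and with an invented sample tweet:

# sketch of the reverted pipeline flow; the Hub model ids are assumed
# stand-ins for the local model_path folders used in the app
from transformers import pipeline

classifier_zero_shot = pipeline(
    task='zero-shot-classification',
    model='valhalla/distilbart-mnli-12-1',
)
classifier_sentiment = pipeline(
    task='sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    return_all_scores=True,  # older transformers API, as in the reverted code
)

candidate_labels = ['finance', 'politics', 'technology', 'wildlife']
sample_tweet = 'Carbon taxes could reshape how green tech gets funded.'  # invented

# zero-shot output for a single string is a dict with 'labels' and 'scores'
# sorted by descending score
zero_shot_sample = classifier_zero_shot(sample_tweet, candidate_labels)
print(zero_shot_sample['labels'][0], round(zero_shot_sample['scores'][0], 2))

# for a single string with return_all_scores=True the sentiment pipeline
# returns [{'label': 'NEGATIVE', ...}, {'label': 'POSITIVE', ...}];
# index 1 is the positive score, exactly as the reverted code indexed it
sentiment_sample = classifier_sentiment(sample_tweet)[1].get('score')
print('positive' if sentiment_sample >= 0.5 else 'negative', round(sentiment_sample, 2))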
 
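Note: downstream of the classifiers, the removed analysis reduces to a plain pandas threshold-filter, merge, and group-by. A self-contained sketch with invented rows, keeping the column names and steps from the reverted code:

import pandas as pd

# toy stand-ins for the zero_shot_results / sentiment_results CSVs (rows invented)
zero_shot_results = pd.DataFrame({
    'tweet': ['tweet a', 'tweet b', 'tweet c', 'tweet d'],
    'category': ['finance', 'wildlife', 'politics', 'finance'],
    'score': [0.91, 0.42, 0.88, 0.73],
})
sentiment_results = pd.DataFrame({
    'tweet': ['tweet a', 'tweet b', 'tweet c', 'tweet d'],
    'sentiment': [0.20, 0.65, 0.10, 0.55],
})

# keep only classifications above a confidence threshold
user_defined_threshold = 0.7
zero_shot_results_clean = zero_shot_results.loc[
    zero_shot_results['score'] >= user_defined_threshold
].copy()

# merge in the sentiment score on the shared index
classification_sentiment_df = pd.merge(
    zero_shot_results_clean, sentiment_results[['sentiment']],
    how='left', left_index=True, right_index=True,
)

# per-category tweet counts and mean sentiment
classification_sentiment_agg = (
    classification_sentiment_df
    .groupby('category')
    .agg({'tweet': 'count', 'sentiment': 'mean'})
    .reset_index()
    .rename(columns={'tweet': 'count'})
)
print(classification_sentiment_agg)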
data/climate_change_tweets.csv DELETED
The diff for this file is too large to render. See raw diff
 
data/imdb.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/sentiment_results.csv DELETED
The diff for this file is too large to render. See raw diff
 
data/zero_shot_results.csv DELETED
The diff for this file is too large to render. See raw diff
 
survey_analytics_library.py CHANGED
@@ -18,6 +18,126 @@ from nltk.corpus import stopwords
 
 
 
+# # create elbow plot with kmeans to find optimal number of clusters
+# def create_elbow_plot_kmeans(df, num_clusters, init_method='k-means++', n_init=10, random_state=42, plot=True, template='simple_white', save=False):
+#     '''
+#     create elbow plot with kmeans to find optimal number of clusters based on inertia
+#     where the clusters strike a balance between being not segmented enough and being too fragmented
+
+#     we look for the point of diminishing returns (also known as the 'elbow') in terms of the inertia,
+#     where inertia is how close the data points are to their respective centers or centroids
+
+#     arguments:
+#         df (df): a dataframe of data to cluster
+#         num_clusters (int): number of clusters to plot
+#         init_method (str): default to 'k-means++', other option is 'random'
+#         n_init (int): default to 10, number of times to run model, cost from the best run will be used
+#         random_state (int): default to 42, random seed used to initialise the model
+#         plot (bool): default to True, option to turn off plots
+#         template (str): default to 'simple_white', change as desired
+#         save (bool): default to False, if True save plot as .html file
+
+#     returns:
+#         a list of inertia for each run
+#     '''
+
+#     # create empty list to store inertia for each run
+#     inertia = []
+#     # define range of clusters to try
+#     k = range(2, num_clusters+1)
+
+#     # loop through number of clusters
+#     for num_clusters in tqdm(k):
+#         # define model
+#         kmeans = KMeans(n_clusters=num_clusters, init=init_method, n_init=n_init, random_state=random_state)
+#         # fit and predict data
+#         kmeans.fit_predict(df)
+#         # get predicted labels
+#         predicted_labels = kmeans.labels_
+#         # append score to list of scores
+#         inertia.append(kmeans.inertia_)
+
+#     # plot elbow plot
+#     if plot:
+#         fig = px.line(
+#             pd.DataFrame({'num_clusters':list(k), 'inertia':inertia}),
+#             x='num_clusters',
+#             y='inertia',
+#             title='Elbow Plot for Optimal Number of Clusters with '+init_method,
+#             markers=True,
+#             template=template,
+#             width=800,
+#             height=500,
+#         )
+#         st.plotly_chart(fig, use_container_width=True)
+#     if save:
+#         fig.write_html('Elbow Plot for Optimal Number of Clusters with '+init_method+'.html')
+
+#     # return
+#     return inertia
+
+
+
+# # create plot of silhouette scores with sklearn model to find optimal number of clusters
+# def silhouette_score_plot_kmeans(df, num_clusters, init_method='k-means++', n_init=10, random_state=42, plot=True, template='simple_white', save=False):
+#     '''
+#     create plot of silhouette score with kmeans to find optimal number of clusters
+#     where the clusters strike a balance between being not segmented enough and being too fragmented
+#     the closer the score is to 1, the more easily distinguishable the clusters are from each other
+
+#     arguments:
+#         df (df): a dataframe of data to cluster
+#         num_clusters (int): number of clusters to plot
+#         init_method (str): default to 'k-means++', other option is 'random'
+#         n_init (int): default to 10, number of times to run model, cost from the best run will be used
+#         random_state (int): default to 42, random seed used to initialise the model
+#         plot (bool): default to True, option to turn off plots
+#         template (str): default to 'simple_white', change as desired
+#         save (bool): default to False, if True save plot as .html file
+
+#     returns:
+#         a list of silhouette scores for each run
+#     '''
+
+#     # create empty list to store silhouette scores for each run
+#     silhouette_scores = []
+#     # define range of clusters to try
+#     k = range(2, num_clusters+1)
+
+#     # loop through number of clusters
+#     for num_clusters in tqdm(k):
+#         # define model
+#         kmeans = KMeans(n_clusters=num_clusters, init=init_method, n_init=n_init, random_state=random_state)
+#         # fit and predict data
+#         kmeans.fit_predict(df)
+#         # get predicted labels
+#         predicted_labels = kmeans.labels_
+#         # get silhouette score
+#         score = silhouette_score(df, predicted_labels)
+#         # append score to list of scores
+#         silhouette_scores.append(score)
+
+#     # plot silhouette scores
+#     if plot:
+#         fig = px.line(
+#             pd.DataFrame({'num_clusters':list(k), 'silhouette_scores':silhouette_scores}),
+#             x='num_clusters',
+#             y='silhouette_scores',
+#             title='Silhouette Scores for Optimal Number of Clusters with '+init_method,
+#             markers=True,
+#             template=template,
+#             width=800,
+#             height=500,
+#         )
+#         st.plotly_chart(fig, use_container_width=True)
+#     if save:
+#         fig.write_html('Silhouette Scores for Optimal Number of Clusters with '+init_method+'.html')
+
+#     # return
+#     return silhouette_scores
+
+
+
 # replace text with multiple replacements
 def replace_text(string, dict_of_replacements):
     '''
@@ -259,41 +379,5 @@ def convert_zero_shot_classification_output_to_dataframe(model_output):
     # drop unused columns
     results = results.drop(['labels', 'scores'], axis=1)
 
-    # return
-    return results
-
-
-# convert transformer model sentiment classification prediction into dataframe
-def convert_sentiment_classification_output_to_dataframe(text_input, model_output):
-    '''
-    convert sentiment classification output into a dataframe
-
-    the model used, distilbert-base-uncased-finetuned-sst-2-english, outputs a list of lists with two dictionaries;
-    within each dictionary is a label, negative or positive, and the respective score
-    [
-        [
-            {'label': 'NEGATIVE', 'score': 0.18449656665325165},
-            {'label': 'POSITIVE', 'score': 0.8155034780502319}
-        ],
-        ...
-    ]
-    the scores sum up to 1, and we extract only the positive score in this function,
-    append the scores to the model's input and return a dataframe
-
-    arguments:
-        text_input (list): a list of sequences that is input for the model
-        model_output (list): a list of labels and scores
-
-    return:
-        a dataframe of sequences and sentiment scores
-
-    '''
-    # store model positive scores as dataframe
-    results = pd.DataFrame(model_output)[[1]]
-    # get score from column
-    results = results[1].apply(lambda x: x.get('score'))
-    # store input sequences and scores as dataframe
-    results = pd.DataFrame({'sequence':text_input, 'score':results})
-
     # return
     return results
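Note: the conversion performed by the removed convert_sentiment_classification_output_to_dataframe is small enough to check by hand; a sketch with a mocked model output in the list-of-lists shape the docstring describes (scores invented):

import pandas as pd

# mocked sentiment pipeline output, shaped as in the removed docstring
text_input = ['tweet a', 'tweet b']
model_output = [
    [{'label': 'NEGATIVE', 'score': 0.18}, {'label': 'POSITIVE', 'score': 0.82}],
    [{'label': 'NEGATIVE', 'score': 0.70}, {'label': 'POSITIVE', 'score': 0.30}],
]

# keep only the POSITIVE dict (column 1), then pull out its score
results = pd.DataFrame(model_output)[[1]]
results = results[1].apply(lambda x: x.get('score'))
# pair each input sequence with its positive score
results = pd.DataFrame({'sequence': text_input, 'score': results})
print(results)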
 
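Note: the re-added (still commented-out) kmeans helpers wrap a standard scikit-learn loop; a minimal sketch of the same elbow and silhouette computation without the Streamlit/Plotly plumbing, on synthetic data with an assumed scikit-learn API only:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

# synthetic data with a known cluster structure
X, _ = make_blobs(n_samples=300, centers=4, random_state=42)

inertia, silhouette_scores = [], []
k_range = range(2, 11)
for k in k_range:
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    labels = kmeans.fit_predict(X)
    inertia.append(kmeans.inertia_)  # elbow: look for diminishing returns
    silhouette_scores.append(silhouette_score(X, labels))  # closer to 1 is better

for k, i, s in zip(k_range, inertia, silhouette_scores):
    print(f'k={k}: inertia={i:.1f}, silhouette={s:.3f}')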