greco commited on
Commit
936e064
·
1 Parent(s): c6af1a7

update explanations

Browse files
Files changed (1) hide show
  1. app.py +44 -23
app.py CHANGED
@@ -22,7 +22,10 @@ from transformers import pipeline
22
  # custom
23
  import survey_analytics_library as LIB
24
 
25
- # st.set_page_config(layout='wide')
 
 
 
26
 
27
  # define data file path
28
  data_path = 'data' + os.sep
@@ -85,7 +88,7 @@ df_factor_analysis = data_survey.copy()
85
  st.subheader('Sample Survey Data')
86
  st.write('''
87
  Here we have a sample survey dataset where responders answer questions about their personality traits on a scale from 1 (Very Inaccurate) to 6 (Very Accurate).
88
- Factor Analysis gives us \'factors\' or groups of responders into groups can provide us insights about the different personalities of the responders.
89
  ''')
90
 
91
  # split page into two columns
@@ -103,14 +106,20 @@ st.write('\n')
103
  st.subheader('Factor Analysis Suitability')
104
  st.write('''
105
  Before performing Factor Analysis on the data, we need to evaluate if it is suitable to do so.
106
- We apply two statistical tests (Bartlett's and KMO test) the data.
 
 
107
  ''')
108
 
109
  # interactive button to run statistical test to determine suitability for factor analysis
110
  if st.button('Run Tests'):
111
- # test with the null hypothesis that the correlation matrix is an identity matrix
 
 
112
  bartlett_sphericity_stat, p_value = calculate_bartlett_sphericity(x=df_factor_analysis)
113
- # test how predictable of a variable by others
 
 
114
  kmo_per_variable, kmo_total = calculate_kmo(x=df_factor_analysis)
115
  # print test results
116
  st.write(f'''
@@ -143,8 +152,9 @@ scree_df = pd.DataFrame({'Eigenvalues':eigenvalues, 'Number of Factors':list(ran
143
  st.subheader('Number of Clusters?')
144
  st.write(f'''
145
  How many clusters or factors are appropriate for our data?
146
- For Factor Analysis, we can determine the number of factors using the Kaiser criterion and a Scree Plot.
147
- We should include factors with an Eigenvalue of at least 1.0.
 
148
  ''')
149
 
150
  # plot scree plot
@@ -194,7 +204,7 @@ responder_factors['cluster'] = responder_factors.apply(lambda s: s.argmax(), axi
194
 
195
  # define list of factor columns
196
  list_of_factor_cols = [col for col in responder_factors.columns if 'factor_' in col]
197
- st.subheader('Fator Analysis Results')
198
  st.write('''
199
  Factor analysis gives us a loading for every factor for each responder.
200
  We assign each responder to a factor or cluster based on their maximum loading across all the factors.
@@ -218,7 +228,7 @@ fa_z_scores = fa_z_scores.apply(lambda x: round(x, 2))
218
 
219
  st.write('''
220
  Aggregating the scores of the clusters gives us detailed insights into the personality traits of the responders.
221
- The scores here have been normalised to Z-scores, a measure of how many standard deviations (SD) is the score away from the mean.
222
  E.g. A Z-score of 0 indicates the score is identical to the mean, while a Z-score of 1 indicates the score is 1 SD away from the mean.
223
  ''')
224
  # define colour map for highlighting cells
@@ -308,7 +318,7 @@ st.plotly_chart(fig, use_container_width=True)
308
 
309
  st.write('''
310
  Now we can see that the topics have improved.
311
- We can make use of the top words in each topic to come up with a meaningful name.
312
  ''')
313
  st.write('\n')
314
  st.write('\n')
@@ -368,7 +378,6 @@ st.write(f'''
368
  As an example, within the topic of 'Climate Change', we are interested in finance, politics, technology, and wildlife.
369
  Using **Zero-shot Classification**, we can classify responses into one of these four categories.
370
  As an added bonus, we can also find out how responders feel about the categories using **Sentiment Analysis**.
371
- We'll use a different set of {len(sentiment_results):,} tweets related to climate change.
372
  ''')
373
  st.write('\n')
374
 
@@ -429,20 +438,24 @@ with st.form('classify_tweets'):
429
  sample_tweet_index = user_define_tweet
430
  sample_tweet = sentiment_results['Tweet'].iloc[sample_tweet_index]
431
  # input for user defined text
432
- user_defined_input = st.text_input('Enter custom text (optional, leave blank to use Tweets):', '')
433
  # check if user has entered any custom text
434
  # if user_define_input is not blank, then override sample_tweet
435
  if user_defined_input:
436
  sample_tweet = user_defined_input
437
 
438
  # submit form
439
- submit = st.form_submit_button('Classify Tweet')
440
  st.write('\n')
441
 
442
  st.write(f'''
443
  Here are the results:
444
  ''')
445
- st.write(f'Input Text: *\'{sample_tweet}\'*')
 
 
 
 
446
 
447
  # get predictions from models
448
  zero_shot_sample = classifier_zero_shot(sample_tweet, candidate_labels)
@@ -454,6 +467,10 @@ sentiment_label = 'positive'
454
  if sentiment_sample < 0.5:
455
  sentiment_label = 'negative'
456
 
 
 
 
 
457
  st.write(f'''
458
  The main category is: **{zero_shot_sample['labels'][0]}** with a score of {round(zero_shot_sample['scores'][0], 2)}
459
  Main category score ranges from 0 to 1, with 1 being very likely.
@@ -461,7 +478,7 @@ st.write(f'''
461
  The full set of scores are: {dict(zip(zero_shot_sample['labels'], [round(score, 2) for score in zero_shot_sample['scores']]))}
462
  The full set of scores adds up to 1.
463
 
464
- The sentiment is: **{sentiment_label}** with a score of {round(sentiment_sample, 2)}
465
  Sentiment score ranges from 0 to 1, with 1 being very positive.
466
  ''')
467
  st.write('\n')
@@ -476,7 +493,7 @@ st.write(f'''
476
  Let's review all the tweets and how they fall into the categories of finance, politics, technology, and wildlife.
477
  ''')
478
 
479
- st.dataframe(zero_shot_results)
480
 
481
  st.write(f'''
482
  We can observe that the model does not have strong confidence in predicting the categories for some of the tweets.
@@ -487,7 +504,7 @@ st.write('\n')
487
 
488
  # interactive input for user to define candidate labels and tweet index for analysis
489
  with st.form('classification_score_threshold'):
490
- user_defined_threshold = st.number_input('Enter score threshold (between 0.01 and 0.99):', min_value=0.01, max_value=0.99, value=0.7, step=0.05)
491
  # submit form
492
  submit = st.form_submit_button('Set Threshold')
493
  st.write('\n')
@@ -500,7 +517,7 @@ sentiment_results.columns = ['tweet', 'sentiment']
500
 
501
  st.write(f'''
502
  The predictions get better with a higher threshold, but this reduces the final number of tweets available for further analysis.
503
- Out of the 10,000 tweets, we are now left with {len(zero_shot_results_clean)}.
504
  We also add on the sentiment score for the tweets, the score here ranges from 0 (most negative) to 1 (most positive).
505
  ''')
506
 
@@ -508,7 +525,11 @@ st.write(f'''
508
  # drop unused columns
509
  classification_sentiment_df = pd.merge(zero_shot_results_clean, sentiment_results[['sentiment']], how='left', left_index=True, right_index=True)
510
  classification_sentiment_df = classification_sentiment_df[['tweet', 'category', 'score', 'sentiment']]
511
- st.dataframe(classification_sentiment_df)
 
 
 
 
512
 
513
  st.write(f'''
514
  The difficult part for zero-shot classification is defining the right set of categories for each business case.
@@ -535,20 +556,20 @@ fig = px.pie(
535
  height=600
536
  )
537
  fig.update_traces(textposition='inside', textinfo='percent+label')
538
- st.plotly_chart(fig)
539
 
540
  fig = px.bar(
541
  classification_sentiment_agg,
542
  x='category',
543
  y='sentiment',
544
- title='Average Sentiment of Tweets in Each Category <br><sup>Overall, the sentiment of the tweets are on the negative side.</sup>',
545
  template='simple_white',
546
  width=1000,
547
  height=600
548
  )
549
  fig.update_yaxes(range=[0, 1])
550
  fig.add_hline(y=0.5, line_width=3, line_color='darkgreen')
551
- st.plotly_chart(fig)
552
 
553
  st.write('\n')
554
- st.markdown('''---''')
 
22
  # custom
23
  import survey_analytics_library as LIB
24
 
25
+ st.set_page_config(
26
+ page_title='Survey Analytics',
27
+ layout='centered',
28
+ )
29
 
30
  # define data file path
31
  data_path = 'data' + os.sep
 
88
  st.subheader('Sample Survey Data')
89
  st.write('''
90
  Here we have a sample survey dataset where responders answer questions about their personality traits on a scale from 1 (Very Inaccurate) to 6 (Very Accurate).
91
+ Factor Analysis gives us \'factors\' or clusters of responders which provide us insights about the different personalities of the responders.
92
  ''')
93
 
94
  # split page into two columns
 
106
  st.subheader('Factor Analysis Suitability')
107
  st.write('''
108
  Before performing Factor Analysis on the data, we need to evaluate if it is suitable to do so.
109
+ We apply two statistical tests (Bartlett's and KMO test) to the data.
110
+ These two tests check if the variables in the data are correlated with each other.
111
+ If there isn't any correlation between the variables, then the data is unsuitable for factor analysis as there are no natural clusters.
112
  ''')
113
 
114
  # interactive button to run statistical test to determine suitability for factor analysis
115
  if st.button('Run Tests'):
116
+ # test if the data is an identity matrix
117
+ # an identity matrix is when the variables in the data are uncorrelated with other variables
118
+ # this means that the data is unsuitable for factor analysis as there are no natural clusters
119
  bartlett_sphericity_stat, p_value = calculate_bartlett_sphericity(x=df_factor_analysis)
120
+ # test how predictable a variable is from the other variables in the data
121
+ # if variables are unpredictable or uncorrelated
122
+ # this means that the data is unsuitable for factor analysis as there are no natural clusters
123
  kmo_per_variable, kmo_total = calculate_kmo(x=df_factor_analysis)
124
  # print test results
125
  st.write(f'''
 
152
  st.subheader('Number of Clusters?')
153
  st.write(f'''
154
  How many clusters or factors are appropriate for our data?
155
+ For Factor Analysis, we can determine the number of factors using the eigenvalues and a scree plot.
156
+ E.g. A factor with an eigenvalue of 5 means that we can represent 5 variables from the data with just 1 factor.
157
+ The Kaiser criterion suggests that we should include factors with an eigenvalue of at least 1, so the factors included should at least represent 1 variable.
158
  ''')
159
 
160
  # plot scree plot
 
204
 
205
  # define list of factor columns
206
  list_of_factor_cols = [col for col in responder_factors.columns if 'factor_' in col]
207
+ st.subheader('Factor Analysis Results')
208
  st.write('''
209
  Factor analysis gives us a loading for every factor for each responder.
210
  We assign each responder to a factor or cluster based on their maximum loading across all the factors.
 
228
 
229
  st.write('''
230
  Aggregating the scores of the clusters gives us detailed insights into the personality traits of the responders.
231
+ The scores here have been normalised to Z-scores, which is a measure of how many standard deviations (SD) is the score away from the mean.
232
  E.g. A Z-score of 0 indicates the score is identical to the mean, while a Z-score of 1 indicates the score is 1 SD away from the mean.
233
  ''')
234
  # define colour map for highlighting cells
 
318
 
319
  st.write('''
320
  Now we can see that the topics have improved.
321
+ We can make use of the top words in each topic to come up with a meaningful name, this has to be done manually and is subjective.
322
  ''')
323
  st.write('\n')
324
  st.write('\n')
 
378
  As an example, within the topic of 'Climate Change', we are interested in finance, politics, technology, and wildlife.
379
  Using **Zero-shot Classification**, we can classify responses into one of these four categories.
380
  As an added bonus, we can also find out how responders feel about the categories using **Sentiment Analysis**.
 
381
  ''')
382
  st.write('\n')
383
 
 
438
  sample_tweet_index = user_define_tweet
439
  sample_tweet = sentiment_results['Tweet'].iloc[sample_tweet_index]
440
  # input for user defined text
441
+ user_defined_input = st.text_input('Enter custom text (optional, leave blank to use tweets):', '')
442
  # check if user has entered any custom text
443
  # if user_define_input is not blank, then override sample_tweet
444
  if user_defined_input:
445
  sample_tweet = user_defined_input
446
 
447
  # submit form
448
+ submit = st.form_submit_button('Classify Text')
449
  st.write('\n')
450
 
451
  st.write(f'''
452
  Here are the results:
453
  ''')
454
+
455
+ if user_defined_input:
456
+ st.write(f'Custom Text: *\'{sample_tweet}\'*')
457
+ else:
458
+ st.write(f'Selected Tweet: *\'{sample_tweet}\'*')
459
 
460
  # get predictions from models
461
  zero_shot_sample = classifier_zero_shot(sample_tweet, candidate_labels)
 
467
  if sentiment_sample < 0.5:
468
  sentiment_label = 'negative'
469
 
470
+ emoji = {
471
+ 'positive':'😀',
472
+ 'negative':'☹️',
473
+ }
474
  st.write(f'''
475
  The main category is: **{zero_shot_sample['labels'][0]}** with a score of {round(zero_shot_sample['scores'][0], 2)}
476
  Main category score ranges from 0 to 1, with 1 being very likely.
 
478
  The full set of scores are: {dict(zip(zero_shot_sample['labels'], [round(score, 2) for score in zero_shot_sample['scores']]))}
479
  The full set of scores adds up to 1.
480
 
481
+ The sentiment is: {emoji[sentiment_label]} **{sentiment_label}** with a score of {round(sentiment_sample, 2)}
482
  Sentiment score ranges from 0 to 1, with 1 being very positive.
483
  ''')
484
  st.write('\n')
 
493
  Let's review all the tweets and how they fall into the categories of finance, politics, technology, and wildlife.
494
  ''')
495
 
496
+ st.dataframe(zero_shot_results.style.highlight_max(axis=1, subset=['finance', 'politics', 'technology', 'wildlife'], props='color:white; background-color:green;').format(precision=2))
497
 
498
  st.write(f'''
499
  We can observe that the model does not have strong confidence in predicting the categories for some of the tweets.
 
504
 
505
  # interactive input for user to define candidate labels and tweet index for analysis
506
  with st.form('classification_score_threshold'):
507
+ user_defined_threshold = st.number_input('Enter score threshold (between 0 and 1):', min_value=0.0, max_value=1.0, value=0.7, step=0.05)
508
  # submit form
509
  submit = st.form_submit_button('Set Threshold')
510
  st.write('\n')
 
517
 
518
  st.write(f'''
519
  The predictions get better with a higher threshold, but this reduces the final number of tweets available for further analysis.
520
+ Out of the {len(sentiment_results):,} tweets, we are now left with {len(zero_shot_results_clean)}.
521
  We also add on the sentiment score for the tweets, the score here ranges from 0 (most negative) to 1 (most positive).
522
  ''')
523
 
 
525
  # drop unused columns
526
  classification_sentiment_df = pd.merge(zero_shot_results_clean, sentiment_results[['sentiment']], how='left', left_index=True, right_index=True)
527
  classification_sentiment_df = classification_sentiment_df[['tweet', 'category', 'score', 'sentiment']]
528
+
529
+ def highlight_sentiment(value):
530
+ color = 'green' if value >= 0.5 else 'red'
531
+ return 'color:{}'.format(color)
532
+ st.dataframe(classification_sentiment_df.style.applymap(highlight_sentiment, subset=['sentiment']).format(precision=2))
533
 
534
  st.write(f'''
535
  The difficult part for zero-shot classification is defining the right set of categories for each business case.
 
556
  height=600
557
  )
558
  fig.update_traces(textposition='inside', textinfo='percent+label')
559
+ st.plotly_chart(fig, use_container_width=True)
560
 
561
  fig = px.bar(
562
  classification_sentiment_agg,
563
  x='category',
564
  y='sentiment',
565
+ title='Average Sentiment of Tweets in Each Category',
566
  template='simple_white',
567
  width=1000,
568
  height=600
569
  )
570
  fig.update_yaxes(range=[0, 1])
571
  fig.add_hline(y=0.5, line_width=3, line_color='darkgreen')
572
+ st.plotly_chart(fig, use_container_width=True)
573
 
574
  st.write('\n')
575
+ st.markdown('''---''')