greco committed on
Commit
56cad86
·
1 Parent(s): 0a6fa53
.gitattributes CHANGED
@@ -25,8 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
- models/bertopic_model_tokyo_olympics_tweets filter=lfs diff=lfs merge=lfs -text
29
- models/bertopic_model_tokyo_olympics_tweets_unclean filter=lfs diff=lfs merge=lfs -text
30
- models/distilbart-mnli-12-1/flax_model.msgpack filter=lfs diff=lfs merge=lfs -text
31
- models/distilbart-mnli-12-1/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
32
- models/distilbert-base-uncased-finetuned-sst-2-english/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
 
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ models/bertopic_model_tokyo_olympics_tweets filter=lfs diff=lfs merge=lfs -text
 
 
 
 
app.py CHANGED
@@ -9,6 +9,7 @@ import os
9
  import matplotlib.pyplot as plt
10
  import seaborn as sns
11
  import plotly.express as px
 
12
 
13
  # factor analysis
14
  from factor_analyzer import FactorAnalyzer
@@ -18,6 +19,7 @@ from scipy.stats import zscore
18
 
19
  # nlp
20
  from bertopic import BERTopic
 
21
 
22
  # custom
23
  import survey_analytics_library as LIB
@@ -38,16 +40,10 @@ def read_survey_data():
38
  data_survey, data_questions = read_survey_data()
39
 
40
  @st.cache
41
- def read_tweet_data():
42
  tokyo = pd.read_csv(data_path+'tokyo_olympics_tweets.csv')
43
  return tokyo
44
- tokyo = read_tweet_data()
45
-
46
- @st.cache(allow_output_mutation=True)
47
- def load_bertopic_model_unclean():
48
- topic_model = BERTopic.load(model_path+'bertopic_model_tokyo_olympics_tweets_unclean')
49
- return topic_model
50
- topic_model_unclean = load_bertopic_model_unclean()
51
 
52
  @st.cache(allow_output_mutation=True)
53
  def load_bertopic_model():
@@ -61,6 +57,14 @@ def read_topic_results():
61
  return topic_results
62
  topic_results = read_topic_results()
63
 
 
 
 
 
 
 
 
 
64
  # write title of app
65
  st.title('DACoP - Survey Analytics')
66
  st.markdown('''---''')
@@ -272,14 +276,9 @@ st.write('''
272
  ''')
273
  st.write('\n')
274
 
275
- # plot topics using unclean data
276
- fig = LIB.visualize_barchart_titles(
277
- topic_model=topic_model_unclean,
278
- subplot_titles=None,
279
- n_words=5,
280
- top_n_topics=8,
281
- height=300
282
- )
283
  st.plotly_chart(fig, use_container_width=True)
284
 
285
  st.write('''
@@ -301,14 +300,9 @@ labelled_topics = [
301
  'Vikas Krishan (Indian Boxer)',
302
  ]
303
 
304
- # plot topics using clean data with stopwords removed
305
- fig = LIB.visualize_barchart_titles(
306
- topic_model=topic_model,
307
- subplot_titles=labelled_topics,
308
- n_words=5,
309
- top_n_topics=8,
310
- height=300
311
- )
312
  st.plotly_chart(fig, use_container_width=True)
313
 
314
  st.write('''
@@ -366,9 +360,184 @@ st.markdown('''---''')
366
 
367
 
368
  st.header('Classifiying Text Responses and Sentiment Analysis')
369
- st.write('''
370
  With survey responses, sometimes as a business user, we already have an general idea of what responders are talking about and we want to categorise or classify the responses accordingly.
371
- E.g.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  ''')
374
  st.write('\n')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  import matplotlib.pyplot as plt
10
  import seaborn as sns
11
  import plotly.express as px
12
+ import pickle
13
 
14
  # factor analysis
15
  from factor_analyzer import FactorAnalyzer
 
19
 
20
  # nlp
21
  from bertopic import BERTopic
22
+ from transformers import pipeline
23
 
24
  # custom
25
  import survey_analytics_library as LIB
 
40
  data_survey, data_questions = read_survey_data()
41
 
42
  @st.cache
43
+ def read_tokyo_data():
44
  tokyo = pd.read_csv(data_path+'tokyo_olympics_tweets.csv')
45
  return tokyo
46
+ tokyo = read_tokyo_data()
 
 
 
 
 
 
47
 
48
  @st.cache(allow_output_mutation=True)
49
  def load_bertopic_model():
 
57
  return topic_results
58
  topic_results = read_topic_results()
59
 
60
+ @st.cache
61
+ def read_climate_change_results():
62
+ sentiment_results = pd.read_csv(data_path+'sentiment_results.csv')
63
+ zero_shot_results = pd.read_csv(data_path+'zero_shot_results.csv')
64
+ return sentiment_results, zero_shot_results
65
+ sentiment_results, zero_shot_results = read_climate_change_results()
66
+
67
+
68
  # write title of app
69
  st.title('DACoP - Survey Analytics')
70
  st.markdown('''---''')
 
276
  ''')
277
  st.write('\n')
278
 
279
+ # load and plot topics using unclean data
280
+ with open('data/topics_tokyo_unclean.pickle', 'rb') as pkl:
281
+ fig = pickle.load(pkl)
 
 
 
 
 
282
  st.plotly_chart(fig, use_container_width=True)
283
 
284
  st.write('''
 
300
  'Vikas Krishan (Indian Boxer)',
301
  ]
302
 
303
+ # load and plot topics using clean data with stopwords removed
304
+ with open('data/topics_tokyo.pickle', 'rb') as pkl:
305
+ fig = pickle.load(pkl)
 
 
 
 
 
306
  st.plotly_chart(fig, use_container_width=True)
307
 
308
  st.write('''
 
360
 
361
 
362
  st.header('Classifiying Text Responses and Sentiment Analysis')
363
+ st.write(f'''
364
  With survey responses, sometimes as a business user, we already have an general idea of what responders are talking about and we want to categorise or classify the responses accordingly.
365
+ As an example, within the topic of 'Climate Change', we are interested in finance, politics, technology, and wildlife.
366
+ Using **Zero-shot Classification**, we can classify responses into one of these four categories.
367
+ As an added bonus, we can also find out how responders feel about the categories using **Sentiment Analysis**.
368
+ We'll use a different set of {len(sentiment_results):,} tweets related to climate change.
369
+ ''')
370
+ st.write('\n')
371
+
372
+ # rename column
373
+ sentiment_results = sentiment_results.rename(columns={'sequence':'Tweet'})
374
+ st.dataframe(sentiment_results[['Tweet']])
375
+
376
+ @st.cache(allow_output_mutation=True)
377
+ def load_transfomer_pipelines():
378
+ classifier_zero_shot = pipeline(
379
+ task='zero-shot-classification',
380
+ model='valhalla/distilbart-mnli-12-1',
381
+ return_all_scores=True
382
+ )
383
+ classifier_sentiment = pipeline(
384
+ task='sentiment-analysis',
385
+ model = 'distilbert-base-uncased-finetuned-sst-2-english',
386
+ return_all_scores=True
387
+ )
388
+ return classifier_zero_shot, classifier_sentiment
389
+ classifier_zero_shot, classifier_sentiment = load_transfomer_pipelines()
390
+
391
+ # define candidate labels
392
+ candidate_labels = [
393
+ 'finance',
394
+ 'politics',
395
+ 'technology',
396
+ 'wildlife',
397
+ ]
398
+
399
+ # define sample tweet
400
+ sample_tweet_index = 5000
401
+
402
+ # define the first and last tweet index
403
+ # create range of index
404
+ tweet_index = sentiment_results.index
405
+ first_tweet = tweet_index[0]
406
+ last_tweet = tweet_index[-1]
407
+
408
+ st.write(f'''
409
+ As a demonstration, we'll define some categories and pick a tweet to classify and determine its sentiment.
410
+ Feel free to add your own categories or even input your own text!
411
+ ''')
412
+
413
+ # interactive input for user to define candidate labels and tweet index for analysis
414
+ with st.form('classify_tweets'):
415
+ # input for labels
416
+ user_defined_labels = st.text_input('Enter categories (separate categories by comma):', ', '.join(candidate_labels))
417
+ candidate_labels = user_defined_labels
418
+ # input for tweet index
419
+ user_define_tweet = st.number_input(f'Enter tweet index (from {first_tweet} to {last_tweet}) to classify:', min_value=first_tweet, max_value=last_tweet, value=sample_tweet_index)
420
+ sample_tweet_index = user_define_tweet
421
+ sample_tweet = sentiment_results['Tweet'].iloc[sample_tweet_index]
422
+ # input for user defined text
423
+ user_defined_input = st.text_input('Enter custom text (optional, leave blank to use Tweets):', '')
424
+ # check if user has entered any custom text
425
+ # if user_defined_input is not blank, then override sample_tweet
426
+ if user_defined_input:
427
+ sample_tweet = user_defined_input
428
+
429
+ # submit form
430
+ submit = st.form_submit_button('Classify Tweet')
431
+
432
+ st.write('\n')
433
+ st.write(f'''
434
+ Here are the results:
435
+ ''')
436
+ st.write(f'Input Text: *\'{sample_tweet}\'*')
437
+
438
+ # get predictions from models
439
+ zero_shot_sample = classifier_zero_shot(sample_tweet, candidate_labels)
440
+ sentiment_sample = classifier_sentiment(sample_tweet)
441
+
442
+ # get sentiment
443
+ sentiment_sample = sentiment_sample[1].get('score')
444
+ sentiment_label = 'positive'
445
+ if sentiment_sample < 0.5:
446
+ sentiment_label = 'negative'
447
+
448
+ st.write(f'''
449
+ The main category is: **{zero_shot_sample['labels'][0]}** with a score of {round(zero_shot_sample['scores'][0], 2)}
450
+ Main category score ranges from 0 to 1, with 1 being very likely.
451
+
452
+ The full set of scores are: {dict(zip(zero_shot_sample['labels'], [round(score, 2) for score in zero_shot_sample['scores']]))}
453
+ The full set of scores adds up to 1.
454
+
455
+ The sentiment is: **{sentiment_label}** with a score of {round(sentiment_sample, 2)}
456
+ Sentiment score ranges from 0 to 1, with 1 being very positive.
457
+ ''')
458
+ st.write('\n')
459
+ st.write('\n')
460
 
461
+ # drop unused columns and rename columns
462
+ zero_shot_results = zero_shot_results.drop('labels_scores', axis=1)
463
+ zero_shot_results = zero_shot_results.rename(columns={'sequence':'tweet', 'label':'category'})
464
+ st.write(f'''
465
+ Lets review all the tweets and how they fall into the categories of finance, politics, technology, and wildlife.
466
+ ''')
467
+
468
+ st.dataframe(zero_shot_results)
469
+
470
+ st.write(f'''
471
+ We can observe that the model does not have strong confidence in predicting the categories for some of the tweets.
472
+ It is likely that the tweet does not naturally fall into one of the defined categories.
473
+ Before performing further analysis on our results, we can set a score threshold to only keep predictions that we're confident in.
474
+ ''')
475
+ st.write('\n')
476
+
477
+ # interactive input for user to define the score threshold used to filter predictions
478
+ with st.form('classification_score_threshold'):
479
+ user_defined_threshold = st.number_input('Enter score threshold (between 0.01 and 0.99):', min_value=0.01, max_value=0.99, value=0.7, step=0.05)
480
+ # submit form
481
+ submit = st.form_submit_button('Set Threshold')
482
+ st.write('\n')
483
+
484
+ # filter and keep results with score above defined threshold
485
+ zero_shot_results_clean = zero_shot_results.loc[(zero_shot_results['score'] >= user_defined_threshold)].copy()
486
+
487
+ # rename columns
488
+ sentiment_results.columns = ['tweet', 'sentiment']
489
+
490
+ st.write(f'''
491
+ The predictions get better with a higher threshold, but reduces the final number of tweets available for further analysis.
492
+ Out of the 10,000 tweets, we are now left with {len(zero_shot_results_clean)}.
493
+ We also add on the sentiment score for the tweets, the score here ranges from 0 (most negative) to 1 (most positive).
494
+ ''')
495
+
496
+ # merge in sentiment score on index
497
+ # drop unused columns
498
+ classification_sentiment_df = pd.merge(zero_shot_results_clean, sentiment_results[['sentiment']], how='left', left_index=True, right_index=True)
499
+ classification_sentiment_df = classification_sentiment_df[['tweet', 'category', 'score', 'sentiment']]
500
+ st.dataframe(classification_sentiment_df)
501
+
502
+ st.write(f'''
503
+ The difficult part for zero-shot classification is defining the right set of categories for each business case.
504
+ Some trial and error is required to find the appropriate words that can return the optimal results.
505
  ''')
506
  st.write('\n')
507
+
508
+ # group by category, count tweets and get mean of sentiment
509
+ classification_sentiment_agg = classification_sentiment_df.groupby(['category']).agg({'tweet':'count', 'sentiment':'mean'}).reset_index()
510
+ classification_sentiment_agg = classification_sentiment_agg.rename(columns={'tweet':'count'})
511
+
512
+ st.write(f'''
513
+ Finally, we can visualise the percentage of tweets in each category and the respective average sentiment scores.
514
+ ''')
515
+
516
+ fig = px.pie(
517
+ classification_sentiment_agg,
518
+ values='count',
519
+ names='category',
520
+ hole=0.35,
521
+ title='Percentage of Tweets in Each Category',
522
+ template='simple_white',
523
+ width=1000,
524
+ height=600
525
+ )
526
+ fig.update_traces(textposition='inside', textinfo='percent+label')
527
+ st.plotly_chart(fig)
528
+
529
+ fig = px.bar(
530
+ classification_sentiment_agg,
531
+ x='category',
532
+ y='sentiment',
533
+ title='Average Sentiment of Tweets in Each Category <br><sup>Overall, the sentiment of the tweets are on the negative side.</sup>',
534
+ template='simple_white',
535
+ width=1000,
536
+ height=600
537
+ )
538
+ fig.update_yaxes(range=[0, 1])
539
+ fig.add_hline(y=0.5, line_width=3, line_color='darkgreen')
540
+ st.plotly_chart(fig)
541
+
542
+ st.write('\n')
543
+ st.markdown('''---''')
data/climate_change_tweets.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/sentiment_results.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/tokyo_topics.csv ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Topic,Count,Name
2
+ -1,2624,-1_silver_medal_proud_mirabaichanu
3
+ 0,428,0_banda_zambia_barbra_barbra banda
4
+ 1,356,1_india_proud_indians_moment
5
+ 2,296,2_sutirtha_mukherjee_sutirtha mukherjee_tabletennis
6
+ 3,287,3_mirabaichanu hearty_lifting womens_mirabaichanu lifting_hearty congratulations
7
+ 4,248,4_race_road_road race_carapaz
8
+ 5,210,5_japan_volleyball_venezuela_flag
9
+ 6,195,6_kerr_sam_sam kerr_matildas
10
+ 7,183,7_vikas_boxing_krishan_vikas krishan
11
+ 8,163,8_gymnastics_mens gymnastics_max_whitlock
12
+ 9,148,9_tennis_murray_singles_nagal
13
+ 10,121,10_bbc_coverage_live_bbcsport
14
+ 11,120,11_ina_facebook_action_officialvkyadav
15
+ 12,115,12_puneethrajkumar cheer4india_dvitva james_dvitva_james puneethrajkumar
16
+ 13,113,13_hockey_south africa_gbhockey_africa
17
+ 14,100,14_judo_takato_gold_japans
18
+ 15,97,15_chanu_mirabai chanu_chanu wins_mirabai
19
+ 16,92,16_swimming_swimming swimming_aquatics_finals
20
+ 17,89,17_medal weightlifting_mirabaichanu winning_ace indian_congratulations ace
21
+ 18,87,18_q2_canwnt_corner_follow live
22
+ 19,85,19_winning medal_indias medal_medal india_medal
23
+ 20,84,20_basketball_3x3_3x3 basketball_usa
24
+ 21,80,21_butterfly_100m_heat_100m butterfly
25
+ 22,78,22_weightlifter_weightlifter mirabai_chanu_mirabai chanu
26
+ 23,78,23_kosovo_distria_krasniqi_distria krasniqi
27
+ 24,77,24_swevaus_damn_furniture_swevaus fuck
28
+ 25,75,25_yulo_carlos_carlos yulo_rings
29
+ 26,71,26_ceremony_opening ceremony_opening_drones
30
+ 27,69,27_medal ongoing_ongoing_winning indias_indias medal
31
+ 28,64,28_teamgb_gb_come_team gb
32
+ 29,62,29_sweden_swedes_swevaus_swedish
33
+ 30,61,30_sweden_australia_rolfo_fridolina
34
+ 31,60,31_japan_britain_great britain_japan great
35
+ 32,59,32_rule_remedy_remedy rule_butterfly
36
+ 33,55,33_silver medal_winning silver_silver_mirabaichanu winning
37
+ 34,52,34_mirabaichanu proud_proud_proud mirabaichanu_mirabaichanu
38
+ 35,51,35_chile_canada_beckie_janine
39
+ 36,51,36_mediasai_virenrasquinha imrahultrehan_iosindiaoff virenrasquinha_iosindiaoff
40
+ 37,49,37_clareburt_lewis_lewis clareburt_kalisz
41
+ 38,49,38_dressage_equestrian_horse_equestrian dressage
42
+ 39,47,39_mirabaichanu wins_49kg category_category_india snatches
43
+ 40,47,40_imrahultrehan congratulations_mirabaichanu mediasai_railminindia_iosindiaoff
44
+ 41,47,41_silver medal_tally_silver_medals tally
45
+ 42,47,42_penalty_penalty china_ref_referee
46
+ 43,45,43_country proud_medal country_country_winning silver
47
+ 44,45,44_teammalaysia_teamindia_teamina_congrats teammalaysia
48
+ 45,44,45_daddies_badminton_daddies badminton_ina
49
+ 46,44,46_chirag_rankireddy_shetty_chirag shetty
50
+ 47,44,47_countrys medal_bringing glory_glory medal_countrys
51
+ 48,43,48_medals_1001_medals won_1001 1001
52
+ 49,43,49_badminton_badmintonmalaysia_ina_wooi yik
53
+ 50,42,50_achieving medal_mirabaichanu achieving_achieving_medal india
54
+ 51,41,51_badminton_malaysia_double_sokongmalaysia
55
+ 52,41,52_sleep_saturday_hours_watch
56
+ 53,41,53_cheer4india_teamindia_da boys_teamindia best
57
+ 54,40,54_sweaus_sweaus football_swe_aus
58
+ 55,40,55_pistol_10m_air pistol_air
59
+ 56,39,56_medal weightlifting_winning silver_weightlifting_silver medal
60
+ 57,38,57_silver india_silver_india_mirabaichanu silver
61
+ 58,37,58_flying start_flying_start huge_huge congratulations
62
+ 59,36,59_archery_mixed team_korea_mixed
63
+ 60,35,60_covid19_covid_paralympics_test
64
+ 61,35,61_athletes_olympians_proud athletes_congratulations joebrier99
65
+ 62,35,62_penalty_swevaus_penalty swevaus_swevaus penalty
66
+ 63,35,63_pakistan_uae_athletes_afghanistan
67
+ 64,34,64_asked_asked happier_india elated_happier start
68
+ 65,34,65_smith_brendon_brendon smith_swim
69
+ 66,33,66_matildas_sweden_matildas sweden_attacking
70
+ 67,32,67_mirabaichanu cheer4india_cheer4india_cheer4india mirabaichanu_mirabaichanu congratulations
71
+ 68,32,68_day mirabaichanu_indias 1st_medal day_weightlifting india
72
+ 69,31,69_boxing_boxers_welterweights_delante
73
+ 70,31,70_loving_let party_officially held_waiting gymnastics
74
+ 71,31,71_400m_mens 400m_heat_400
75
+ 72,30,72_malaysia_malaysiaboleh_malaysiaboleh congrats_malaysia malaysia
76
+ 73,30,73_time india_india clinches_clinches medal_day hearty
77
+ 74,30,74_silver medal_medal india_silver_india
78
+ 75,30,75_mirabai chanu_mirabai_chanu_saikhom mirabai
79
+ 76,30,76_football_womens football_soccer_women
80
+ 77,30,77_mcgrail_peter mcgrail_peter_butdee
81
+ 78,29,78_display weightlifting_amazing display_absolutely amazing_display
82
+ 79,29,79_cheer4india_medal cheer4india_indias mirabaichanu_medal medal
83
+ 80,29,80_mirabaichanu teamindia_teamindia_teamindia mirabaichanu_proud teamindia
84
+ 81,29,81_spain_waterpolo_water polo_polo
85
+ 82,29,82_daddies_daddies daddies_daddies victory_mantap daddies
86
+ 83,28,83_pen_pen swevaus_swevaus pen_swevaus
87
+ 84,28,84_mirabaichanu mirabaichanu_mirabaichanu_congratulations mirabaichanu_power
88
+ 85,27,85_congratulations mirabai_chanu winning_mirabai chanu_49 kg
89
+ 86,27,86_silver weightlifting_huge congratulations_huge_winning silver
90
+ 87,27,87_qian_yang qian_yang_chinas
91
+ 88,27,88_medal womens_category_49kg_winning silver
92
+ 89,27,89_potential_massive potential_long term_term quick
93
+ 90,26,90_matildas_fark_pen matildas_matildas matildas
94
+ 91,26,91_grande_carapaz_hispanos_grande carapaz
95
+ 92,25,92_gift selflove_mensfashion_selflove_selfie mensfashion
96
+ 93,24,93_matildas_matildas swevaus_swevaus_swevaus matildas
97
+ 94,24,94_49_womens 49_49 kgs_kgs
98
+ 95,24,95_thematildas_goaustralia_thematildas samkerr1_goaustralia thematildas
99
+ 96,23,96_new zealand_zealand_hockey_new
100
+ 97,23,97_chanu secured_secured medal_secured_country winning
101
+ 98,23,98_weightlifting lets_lets cheer_cheer india_cheer
102
+ 99,23,99_raymondcupid kyereminator_kyereminator daterush_watch hisbella4_kyereminator
103
+ 100,22,100_mirabaichanu silver_silver_silver mirabaichanu_mam silver
104
+ 101,22,101_nigeria_ghana_team_ghanas
105
+ 102,22,102_aus_ausvswe_aussies_australia
106
+ 103,22,103_winning silver_mirabaichanu winning_weightlifting medal_silver weightlifting
107
+ 104,22,104_teamindia 49kg_silver medal_mirabaichanu won_medal radiant
108
+ 105,21,105_swimming_bbcsport_bbc_swimming heats
109
+ 106,21,106_mirabaichanu weightlifting_weightlifting_india mirabaichanu_india
110
+ 107,21,107_mirabaichanu weightlifting_weightlifting_spirits_bow
111
+ 108,21,108_history mirabai_teamindia mirabaichanu_chanu won_medal teamindia
112
+ 109,21,109_giochiolimpici_forzaazzurri_olimpiadi forzaazzurri_olimpiadi
113
+ 110,21,110_handball_portugal_egypt_esp
114
+ 111,21,111_seto_daiya_daiya seto_shock
115
+ 112,21,112_congratulations mirabai_chanu winning_chanu_mirabai
116
+ 113,21,113_brazil_netherlands_netherlands brazil_brazil womens
117
+ 114,20,114_mohanlal_mirabaichanu congratulations_mohanlal mirabaichanu_winning indias
118
+ 115,20,115_day congratulations_congratulations saikhom_49kg weightlift_weightlift
119
+ 116,20,116_saikhom_saikhom mirabai_congratulations saikhom_chanu winning
120
+ 117,20,117_dreams_criticism blood_sacrifice_criticism
121
+ 118,20,118_peaty_adam_adam peaty_adampeaty
122
+ 119,19,119_actor_medal winner_favourite actor_winner mirabaichanu
123
+ 120,19,120_peng_ying_chan peng_chan
124
+ 121,19,121_taekwondo_jin_barbosa_kurt
125
+ 122,18,122_fencing_samele fencing_samele_2nd round
126
+ 123,18,123_congratulated winning_mirabaichanu congratulated_congratulated_spoke
127
+ 124,18,124_strikes medal_india strikes_strikes_medal 49
128
+ 125,17,125_mirabaichanu comes_comes india_mohanlal mirabaichanu_mohanlal
129
+ 126,15,126_carrying_moment_proud moment_proud
130
+ 127,15,127_cheer4india_country cheer4india_teamindia_medal teamindia
131
+ 128,15,128_medal mirabai_kg womens_medal 49_womens weightlifting
data/topics_tokyo.pickle ADDED
Binary file (9.44 kB). View file
 
data/topics_tokyo_unclean.pickle ADDED
Binary file (9.34 kB). View file
 
data/zero_shot_results.csv ADDED
The diff for this file is too large to render. See raw diff
 
models/bertopic_model_tokyo_olympics_tweets_unclean DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0de856ed231c12e7baeaff15eb3159e1a5ef7c5512b459f915f46712f6d203a3
3
- size 71961846
 
 
 
 
models/distilbart-mnli-12-1/README.md DELETED
@@ -1,59 +0,0 @@
1
- ---
2
- datasets:
3
- - mnli
4
- tags:
5
- - distilbart
6
- - distilbart-mnli
7
- pipeline_tag: zero-shot-classification
8
- ---
9
-
10
- # DistilBart-MNLI
11
-
12
- distilbart-mnli is the distilled version of bart-large-mnli created using the **No Teacher Distillation** technique proposed for BART summarisation by Huggingface, [here](https://github.com/huggingface/transformers/tree/master/examples/seq2seq#distilbart).
13
-
14
- We just copy alternating layers from `bart-large-mnli` and finetune more on the same data.
15
-
16
-
17
- | | matched acc | mismatched acc |
18
- | ------------------------------------------------------------------------------------ | ----------- | -------------- |
19
- | [bart-large-mnli](https://huggingface.co/facebook/bart-large-mnli) (baseline, 12-12) | 89.9 | 90.01 |
20
- | [distilbart-mnli-12-1](https://huggingface.co/valhalla/distilbart-mnli-12-1) | 87.08 | 87.5 |
21
- | [distilbart-mnli-12-3](https://huggingface.co/valhalla/distilbart-mnli-12-3) | 88.1 | 88.19 |
22
- | [distilbart-mnli-12-6](https://huggingface.co/valhalla/distilbart-mnli-12-6) | 89.19 | 89.01 |
23
- | [distilbart-mnli-12-9](https://huggingface.co/valhalla/distilbart-mnli-12-9) | 89.56 | 89.52 |
24
-
25
-
26
- This is a very simple and effective technique, as we can see the performance drop is very little.
27
-
28
- Detailed performance trade-offs will be posted in this [sheet](https://docs.google.com/spreadsheets/d/1dQeUvAKpScLuhDV1afaPJRRAE55s2LpIzDVA5xfqxvk/edit?usp=sharing).
29
-
30
-
31
- ## Fine-tuning
32
- If you want to train these models yourself, clone the [distillbart-mnli repo](https://github.com/patil-suraj/distillbart-mnli) and follow the steps below
33
-
34
- Clone and install transformers from source
35
- ```bash
36
- git clone https://github.com/huggingface/transformers.git
37
- pip install -qqq -U ./transformers
38
- ```
39
-
40
- Download MNLI data
41
- ```bash
42
- python transformers/utils/download_glue_data.py --data_dir glue_data --tasks MNLI
43
- ```
44
-
45
- Create student model
46
- ```bash
47
- python create_student.py \
48
- --teacher_model_name_or_path facebook/bart-large-mnli \
49
- --student_encoder_layers 12 \
50
- --student_decoder_layers 6 \
51
- --save_path student-bart-mnli-12-6 \
52
- ```
53
-
54
- Start fine-tuning
55
- ```bash
56
- python run_glue.py args.json
57
- ```
58
-
59
- You can find the logs of these trained models in this [wandb project](https://wandb.ai/psuraj/distilbart-mnli).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/distilbart-mnli-12-1/config.json DELETED
@@ -1,56 +0,0 @@
1
- {
2
- "_num_labels": 3,
3
- "activation_dropout": 0.0,
4
- "activation_function": "gelu",
5
- "add_bias_logits": false,
6
- "add_final_layer_norm": false,
7
- "architectures": [
8
- "BartForSequenceClassification"
9
- ],
10
- "attention_dropout": 0.1,
11
- "bos_token_id": 0,
12
- "classif_dropout": 0.0,
13
- "classifier_dropout": 0.0,
14
- "d_model": 1024,
15
- "decoder_attention_heads": 16,
16
- "decoder_ffn_dim": 4096,
17
- "decoder_layerdrop": 0.0,
18
- "decoder_layers": 1,
19
- "decoder_start_token_id": 2,
20
- "dropout": 0.1,
21
- "encoder_attention_heads": 16,
22
- "encoder_ffn_dim": 4096,
23
- "encoder_layerdrop": 0.0,
24
- "encoder_layers": 12,
25
- "eos_token_id": 2,
26
- "extra_pos_embeddings": 2,
27
- "finetuning_task": "mnli",
28
- "force_bos_token_to_be_generated": false,
29
- "forced_eos_token_id": 2,
30
- "gradient_checkpointing": false,
31
- "id2label": {
32
- "0": "contradiction",
33
- "1": "neutral",
34
- "2": "entailment"
35
- },
36
- "init_std": 0.02,
37
- "is_encoder_decoder": true,
38
- "label2id": {
39
- "contradiction": 0,
40
- "entailment": 2,
41
- "neutral": 1
42
- },
43
- "max_position_embeddings": 1024,
44
- "model_type": "bart",
45
- "normalize_before": false,
46
- "normalize_embedding": true,
47
- "num_hidden_layers": 12,
48
- "output_past": false,
49
- "pad_token_id": 1,
50
- "scale_embedding": false,
51
- "static_position_embeddings": false,
52
- "total_flos": 153130534133111808,
53
- "transformers_version": "4.7.0.dev0",
54
- "use_cache": true,
55
- "vocab_size": 50265
56
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/distilbart-mnli-12-1/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
models/distilbart-mnli-12-1/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa79ff59084a5036b07a9cffeaa1b1b7c1aa5edeb1885416a734c001a09aa046
3
- size 890410947
 
 
 
 
models/distilbart-mnli-12-1/special_tokens_map.json DELETED
@@ -1 +0,0 @@
1
- {"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
 
 
models/distilbart-mnli-12-1/tokenizer_config.json DELETED
@@ -1 +0,0 @@
1
- {"model_max_length": 1024}
 
 
models/distilbart-mnli-12-1/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
models/distilbert-base-uncased-finetuned-sst-2-english/README.md DELETED
@@ -1,31 +0,0 @@
1
- ---
2
- language: en
3
- license: apache-2.0
4
- datasets:
5
- - sst-2
6
- ---
7
-
8
- # DistilBERT base uncased finetuned SST-2
9
-
10
- This model is a fine-tune checkpoint of [DistilBERT-base-uncased](https://huggingface.co/distilbert-base-uncased), fine-tuned on SST-2.
11
- This model reaches an accuracy of 91.3 on the dev set (for comparison, Bert bert-base-uncased version reaches an accuracy of 92.7).
12
-
13
- For more details about DistilBERT, we encourage users to check out [this model card](https://huggingface.co/distilbert-base-uncased).
14
-
15
- # Fine-tuning hyper-parameters
16
-
17
- - learning_rate = 1e-5
18
- - batch_size = 32
19
- - warmup = 600
20
- - max_seq_length = 128
21
- - num_train_epochs = 3.0
22
-
23
- # Bias
24
-
25
- Based on a few experimentations, we observed that this model could produce biased predictions that target underrepresented populations.
26
-
27
- For instance, for sentences like `This film was filmed in COUNTRY`, this binary classification model will give radically different probabilities for the positive label depending on the country (0.89 if the country is France, but 0.08 if the country is Afghanistan) when nothing in the input indicates such a strong semantic shift. In this [colab](https://colab.research.google.com/gist/ageron/fb2f64fb145b4bc7c49efc97e5f114d3/biasmap.ipynb), [Aurélien Géron](https://twitter.com/aureliengeron) made an interesting map plotting these probabilities for each country.
28
-
29
- <img src="https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/map.jpeg" alt="Map of positive probabilities per country." width="500"/>
30
-
31
- We strongly advise users to thoroughly probe these aspects on their use-cases in order to evaluate the risks of this model. We recommend looking at the following bias evaluation datasets as a place to start: [WinoBias](https://huggingface.co/datasets/wino_bias), [WinoGender](https://huggingface.co/datasets/super_glue), [Stereoset](https://huggingface.co/datasets/stereoset).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/distilbert-base-uncased-finetuned-sst-2-english/config.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "activation": "gelu",
3
- "architectures": [
4
- "DistilBertForSequenceClassification"
5
- ],
6
- "attention_dropout": 0.1,
7
- "dim": 768,
8
- "dropout": 0.1,
9
- "finetuning_task": "sst-2",
10
- "hidden_dim": 3072,
11
- "id2label": {
12
- "0": "NEGATIVE",
13
- "1": "POSITIVE"
14
- },
15
- "initializer_range": 0.02,
16
- "label2id": {
17
- "NEGATIVE": 0,
18
- "POSITIVE": 1
19
- },
20
- "max_position_embeddings": 512,
21
- "model_type": "distilbert",
22
- "n_heads": 12,
23
- "n_layers": 6,
24
- "output_past": true,
25
- "pad_token_id": 0,
26
- "qa_dropout": 0.1,
27
- "seq_classif_dropout": 0.2,
28
- "sinusoidal_pos_embds": false,
29
- "tie_weights_": true,
30
- "vocab_size": 30522
31
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/distilbert-base-uncased-finetuned-sst-2-english/map.jpeg DELETED
Binary file (81.6 kB)
 
models/distilbert-base-uncased-finetuned-sst-2-english/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:60554cbd7781b09d87f1ececbea8c064b94e49a7f03fd88e8775bfe6cc3d9f88
3
- size 267844284
 
 
 
 
models/distilbert-base-uncased-finetuned-sst-2-english/tokenizer_config.json DELETED
@@ -1 +0,0 @@
1
- {"model_max_length": 512, "do_lower_case": true}
 
 
models/distilbert-base-uncased-finetuned-sst-2-english/vocab.txt DELETED
The diff for this file is too large to render. See raw diff
 
survey_analytics_library.py CHANGED
@@ -18,126 +18,6 @@ from nltk.corpus import stopwords
18
 
19
 
20
 
21
- # # create elbow plot with kmeans to find optimal number of clusters
22
- # def create_elbow_plot_kmeans(df, num_clusters, init_method='k-means++', n_init=10, random_state=42, plot=True, template='simple_white', save=False):
23
- # '''
24
- # create elbow plot with kmeans to find optimal number of clusters based on inertia
25
- # where the clusters strikes a balance between being not segmented enough and being too fragmented
26
-
27
- # we look for the point of diminishing returns (also known as the 'elbow') in terms of the inertia,
28
- # where inertia is how close the data points are to their respective centers or centroids
29
-
30
- # arguments:
31
- # df (df): a dataframe of data to cluster
32
- # num_clusters (int): number of clusters to plot
33
- # init_method (str): default to 'k-means++', other option is 'random'
34
- # n_init (int): default to 10, number of times to run model, cost from the best run will be used
35
- # random_state (int): default to 42, random seed used to initialise the model
36
- # plot (bool): default to True, option to turn off plots
37
- # template (str): default to 'simple_white', change as desired
38
- # save (bool): default to False, if True save plot as .html file
39
-
40
- # returns:
41
- # a list of inertia for each run
42
- # '''
43
-
44
- # # create empty list to store inertia for each run
45
- # inertia = []
46
- # # define range of clusters to try
47
- # k = range(2, num_clusters+1)
48
-
49
- # # loop through number of clusters
50
- # for num_clusters in tqdm(k):
51
- # # define model
52
- # kmeans = KMeans(n_clusters=num_clusters, init=init_method, n_init=n_init, random_state=random_state)
53
- # # fit and predict data
54
- # kmeans.fit_predict(df)
55
- # # get predicted labels
56
- # predicted_labels = kmeans.labels_
57
- # # append score to list of scores
58
- # inertia.append(kmeans.inertia_)
59
-
60
- # # plot elbow plot
61
- # if plot:
62
- # fig = px.line(
63
- # pd.DataFrame({'num_clusters':list(k), 'inertia':inertia}),
64
- # x='num_clusters',
65
- # y='inertia',
66
- # title='Elbow Plot for Optimal Number of Clusters with '+init_method,
67
- # markers=True,
68
- # template=template,
69
- # width=800,
70
- # height=500,
71
- # )
72
- # st.plotly_chart(fig, use_container_width=True)
73
- # if save:
74
- # fig.write_html('Elbow Plot for Optimal Number of Clusters with '+init_method+'.html')
75
-
76
- # # return
77
- # return inertia
78
-
79
-
80
-
81
- # # create plot of silhouette scores with sklearn model to find optimal number of clusters
82
- # def silhouette_score_plot_kmeans(df, num_clusters, init_method='k-means++', n_init=10, random_state=42, plot=True, template='simple_white', save=False):
83
- # '''
84
- # create plot of silhouette score with kmeans to find optimal number of clusters
85
- # where the clusters strikes a balance between being not segmented enough and being too fragmented
86
- # the closer the score is to 1, the more easily distinguishable are the clusters from each other
87
-
88
- # arguments:
89
- # df (df): a dataframe of data to cluster
90
- # num_clusters (int): number of clusters to plot
91
- # init_method (str): default to 'k-means++', other option is 'random'
92
- # n_init (int): default to 10, number of times to run model, cost from the best run will be used
93
- # random_state (int): default to 42, random seed used to initialise the model
94
- # plot (bool): default to True, option to turn off plots
95
- # template (str): default to 'simple_white', change as desired
96
- # save (bool): default to False, if True save plot as .html file
97
-
98
- # returns:
99
- # a list of silhouette scores for each run
100
- # '''
101
-
102
- # # create empty list to store silhoutte scores for each run
103
- # silhouette_scores = []
104
- # # define range of clusters to try
105
- # k = range(2, num_clusters+1)
106
-
107
- # # loop through number of clusters
108
- # for num_clusters in tqdm(k):
109
- # # define model
110
- # kmeans = KMeans(n_clusters=num_clusters, init=init_method, n_init=n_init, random_state=random_state)
111
- # # fit and predict data
112
- # kmeans.fit_predict(df)
113
- # # get predicted labels
114
- # predicted_labels = kmeans.labels_
115
- # # get silhoutte score
116
- # score = silhouette_score(df, predicted_labels)
117
- # # append score to list of scores
118
- # silhouette_scores.append(score)
119
-
120
- # # plot silhouette scores
121
- # if plot:
122
- # fig = px.line(
123
- # pd.DataFrame({'num_clusters':list(k), 'silhouette_scores':silhouette_scores}),
124
- # x='num_clusters',
125
- # y='silhouette_scores',
126
- # title='Silhouette Scores for Optimal Number of Clusters with '+init_method,
127
- # markers=True,
128
- # template=template,
129
- # width=800,
130
- # height=500,
131
- # )
132
- # st.plotly_chart(fig, use_container_width=True)
133
- # if save:
134
- # fig.write_html('Silhouette Scores for Optimal Number of Clusters with '+init_method+'.html')
135
-
136
- # # return
137
- # return silhouette_scores
138
-
139
-
140
-
141
  # replace text with multiple replacements
142
  def replace_text(string, dict_of_replacements):
143
  '''
@@ -379,5 +259,41 @@ def convert_zero_shot_classification_output_to_dataframe(model_output):
379
  # drop unused columns
380
  results = results.drop(['labels', 'scores'], axis=1)
381
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
  # return
383
  return results
 
18
 
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # replace text with multiple replacements
22
  def replace_text(string, dict_of_replacements):
23
  '''
 
259
  # drop unused columns
260
  results = results.drop(['labels', 'scores'], axis=1)
261
 
262
+ # return
263
+ return results
264
+
265
+
266
+ # convert transformer model sentiment classification prediction into dataframe
267
+ def convert_sentiment_classification_output_to_dataframe(text_input, model_output):
268
+ '''
269
+ convert sentiment classification output into a dataframe
270
+
271
+ the model used, distilbert-base-uncased-finetuned-sst-2-english, outputs a list of lists, each containing two dictionaries,
272
+ within each dictionary is a label, negative or positive, and the respective score
273
+ [
274
+ [
275
+ {'label': 'NEGATIVE', 'score': 0.18449656665325165},
276
+ {'label': 'POSITIVE', 'score': 0.8155034780502319}
277
+ ],
278
+ ...
279
+ ]
280
+ the scores sum up to 1, and we extract only the positive score in this function,
281
+ append the scores to the model's input and return a dataframe
282
+
283
+ arguments:
284
+ text_input (list): a list of sequences that is input for the model
285
+ model_output (list): a list of labels and scores
286
+
287
+ return:
288
+ a dataframe of sequences and sentiment score
289
+
290
+ '''
291
+ # store model positive scores as dataframe
292
+ results = pd.DataFrame(model_output)[[1]]
293
+ # get score from column
294
+ results = results[1].apply(lambda x: x.get('score'))
295
+ # store input sequences and scores as dataframe
296
+ results = pd.DataFrame({'sequence':text_input, 'score':results})
297
+
298
  # return
299
  return results