Spaces:
Runtime error
update
Browse files
- .gitattributes +1 -5
- app.py +195 -26
- data/climate_change_tweets.csv +0 -0
- data/sentiment_results.csv +0 -0
- data/tokyo_topics.csv +131 -0
- data/topics_tokyo.pickle +0 -0
- data/topics_tokyo_unclean.pickle +0 -0
- data/zero_shot_results.csv +0 -0
- models/bertopic_model_tokyo_olympics_tweets_unclean +0 -3
- models/distilbart-mnli-12-1/README.md +0 -59
- models/distilbart-mnli-12-1/config.json +0 -56
- models/distilbart-mnli-12-1/merges.txt +0 -0
- models/distilbart-mnli-12-1/pytorch_model.bin +0 -3
- models/distilbart-mnli-12-1/special_tokens_map.json +0 -1
- models/distilbart-mnli-12-1/tokenizer_config.json +0 -1
- models/distilbart-mnli-12-1/vocab.json +0 -0
- models/distilbert-base-uncased-finetuned-sst-2-english/README.md +0 -31
- models/distilbert-base-uncased-finetuned-sst-2-english/config.json +0 -31
- models/distilbert-base-uncased-finetuned-sst-2-english/map.jpeg +0 -0
- models/distilbert-base-uncased-finetuned-sst-2-english/pytorch_model.bin +0 -3
- models/distilbert-base-uncased-finetuned-sst-2-english/tokenizer_config.json +0 -1
- models/distilbert-base-uncased-finetuned-sst-2-english/vocab.txt +0 -0
- survey_analytics_library.py +36 -120
.gitattributes
CHANGED
@@ -25,8 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-models/bertopic_model_tokyo_olympics_tweets filter=lfs diff=lfs merge=lfs -text
-models/bertopic_model_tokyo_olympics_tweets_unclean filter=lfs diff=lfs merge=lfs -text
-models/distilbart-mnli-12-1/flax_model.msgpack filter=lfs diff=lfs merge=lfs -text
-models/distilbart-mnli-12-1/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
-models/distilbert-base-uncased-finetuned-sst-2-english/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
+models/bertopic_model_tokyo_olympics_tweets filter=lfs diff=lfs merge=lfs -text
app.py
CHANGED
@@ -9,6 +9,7 @@ import os
 import matplotlib.pyplot as plt
 import seaborn as sns
 import plotly.express as px
+import pickle
 
 # factor analysis
 from factor_analyzer import FactorAnalyzer
@@ -18,6 +19,7 @@ from scipy.stats import zscore
 
 # nlp
 from bertopic import BERTopic
+from transformers import pipeline
 
 # custom
 import survey_analytics_library as LIB
@@ -38,16 +40,10 @@ def read_survey_data():
 data_survey, data_questions = read_survey_data()
 
 @st.cache
-def
+def read_tokyo_data():
     tokyo = pd.read_csv(data_path+'tokyo_olympics_tweets.csv')
     return tokyo
-tokyo =
-
-@st.cache(allow_output_mutation=True)
-def load_bertopic_model_unclean():
-    topic_model = BERTopic.load(model_path+'bertopic_model_tokyo_olympics_tweets_unclean')
-    return topic_model
-topic_model_unclean = load_bertopic_model_unclean()
+tokyo = read_tokyo_data()
 
 @st.cache(allow_output_mutation=True)
 def load_bertopic_model():
@@ -61,6 +57,14 @@ def read_topic_results():
     return topic_results
 topic_results = read_topic_results()
 
+@st.cache
+def read_climate_change_results():
+    sentiment_results = pd.read_csv(data_path+'sentiment_results.csv')
+    zero_shot_results = pd.read_csv(data_path+'zero_shot_results.csv')
+    return sentiment_results, zero_shot_results
+sentiment_results, zero_shot_results = read_climate_change_results()
+
+
 # write title of app
 st.title('DACoP - Survey Analytics')
 st.markdown('''---''')
@@ -272,14 +276,9 @@ st.write('''
 ''')
 st.write('\n')
 
-# plot topics using unclean data
-
-
-    subplot_titles=None,
-    n_words=5,
-    top_n_topics=8,
-    height=300
-)
+# load and plot topics using unclean data
+with open('data/topics_tokyo_unclean.pickle', 'rb') as pkl:
+    fig = pickle.load(pkl)
 st.plotly_chart(fig, use_container_width=True)
 
 st.write('''
@@ -301,14 +300,9 @@ labelled_topics = [
 'Vikas Krishan (Indian Boxer)',
 ]
 
-# plot topics using clean data with stopwords removed
-
-
-    subplot_titles=labelled_topics,
-    n_words=5,
-    top_n_topics=8,
-    height=300
-)
+# load and plot topics using clean data with stopwords removed
+with open('data/topics_tokyo.pickle', 'rb') as pkl:
+    fig = pickle.load(pkl)
 st.plotly_chart(fig, use_container_width=True)
 
 st.write('''
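The two hunks above swap live BERTopic visualisation calls for pre-computed plotly figures, which is what allows this commit to delete the heavyweight model files further down. A minimal sketch of how such figure pickles could have been produced offline, assuming the (now-removed) BERTopic model files and BERTopic's `visualize_barchart()` API; the file names mirror the ones committed here:

```python
import pickle
from bertopic import BERTopic

# load the trained topic model (the one removed from this repo)
topic_model = BERTopic.load('models/bertopic_model_tokyo_olympics_tweets')

# render the topic bar chart once, offline
fig = topic_model.visualize_barchart(top_n_topics=8, n_words=5, height=300)

# persist the plotly figure so the app can display it without the model
with open('data/topics_tokyo.pickle', 'wb') as pkl:
    pickle.dump(fig, pkl)
```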
@@ -366,9 +360,184 @@ st.markdown('''---''')
 
 
 st.header('Classifying Text Responses and Sentiment Analysis')
-st.write('''
+st.write(f'''
 With survey responses, sometimes as a business user, we already have a general idea of what responders are talking about and we want to categorise or classify the responses accordingly.
-
-
+    As an example, within the topic of 'Climate Change', we are interested in finance, politics, technology, and wildlife.
+    Using **Zero-shot Classification**, we can classify responses into one of these four categories.
+    As an added bonus, we can also find out how responders feel about the categories using **Sentiment Analysis**.
+    We'll use a different set of {len(sentiment_results):,} tweets related to climate change.
+''')
+st.write('\n')
+
+# rename column
+sentiment_results = sentiment_results.rename(columns={'sequence':'Tweet'})
+st.dataframe(sentiment_results[['Tweet']])
+
+@st.cache(allow_output_mutation=True)
+def load_transformer_pipelines():
+    classifier_zero_shot = pipeline(
+        task='zero-shot-classification',
+        model='valhalla/distilbart-mnli-12-1',
+        return_all_scores=True
+    )
+    classifier_sentiment = pipeline(
+        task='sentiment-analysis',
+        model='distilbert-base-uncased-finetuned-sst-2-english',
+        return_all_scores=True
+    )
+    return classifier_zero_shot, classifier_sentiment
+classifier_zero_shot, classifier_sentiment = load_transformer_pipelines()
+
+# define candidate labels
+candidate_labels = [
+    'finance',
+    'politics',
+    'technology',
+    'wildlife',
+]
+
+# define sample tweet
+sample_tweet_index = 5000
+
+# define the first and last tweet index
+tweet_index = sentiment_results.index
+first_tweet = tweet_index[0]
+last_tweet = tweet_index[-1]
+
+st.write(f'''
+    As a demonstration, we'll define some categories and pick a tweet to classify and determine its sentiment.
+    Feel free to add your own categories or even input your own text!
+''')
+
+# interactive input for user to define candidate labels and tweet index for analysis
+with st.form('classify_tweets'):
+    # input for labels
+    user_defined_labels = st.text_input('Enter categories (separate categories by comma):', ', '.join(candidate_labels))
+    candidate_labels = user_defined_labels
+    # input for tweet index
+    user_define_tweet = st.number_input(f'Enter tweet index (from {first_tweet} to {last_tweet}) to classify:', min_value=first_tweet, max_value=last_tweet, value=sample_tweet_index)
+    sample_tweet_index = user_define_tweet
+    sample_tweet = sentiment_results['Tweet'].iloc[sample_tweet_index]
+    # input for user-defined text
+    user_defined_input = st.text_input('Enter custom text (optional, leave blank to use Tweets):', '')
+    # if the user entered any custom text, override sample_tweet
+    if user_defined_input:
+        sample_tweet = user_defined_input
+
+    # submit form
+    submit = st.form_submit_button('Classify Tweet')
+
+st.write('\n')
+st.write(f'''
+    Here are the results:
+''')
+st.write(f'Input Text: *\'{sample_tweet}\'*')
+
+# get predictions from models
+zero_shot_sample = classifier_zero_shot(sample_tweet, candidate_labels)
+sentiment_sample = classifier_sentiment(sample_tweet)
+
+# get sentiment (positive-class score)
+sentiment_sample = sentiment_sample[1].get('score')
+sentiment_label = 'positive'
+if sentiment_sample < 0.5:
+    sentiment_label = 'negative'
+
+st.write(f'''
+    The main category is: **{zero_shot_sample['labels'][0]}** with a score of {round(zero_shot_sample['scores'][0], 2)}
+    The main category score ranges from 0 to 1, with 1 being very likely.
+
+    The full set of scores is: {dict(zip(zero_shot_sample['labels'], [round(score, 2) for score in zero_shot_sample['scores']]))}
+    The full set of scores adds up to 1.
+
+    The sentiment is: **{sentiment_label}** with a score of {round(sentiment_sample, 2)}
+    The sentiment score ranges from 0 to 1, with 1 being very positive.
+''')
+st.write('\n')
+st.write('\n')
+
+# drop unused columns and rename columns
+zero_shot_results = zero_shot_results.drop('labels_scores', axis=1)
+zero_shot_results = zero_shot_results.rename(columns={'sequence':'tweet', 'label':'category'})
+st.write(f'''
+    Let's review all the tweets and how they fall into the categories of finance, politics, technology, and wildlife.
+''')
+
+st.dataframe(zero_shot_results)
+
+st.write(f'''
+    We can observe that the model does not have strong confidence in predicting the categories for some of the tweets.
+    It is likely that such a tweet does not naturally fall into one of the defined categories.
+    Before performing further analysis on our results, we can set a score threshold to keep only predictions that we're confident in.
+''')
+st.write('\n')
+
+# interactive input for user to define the classification score threshold
+with st.form('classification_score_threshold'):
+    user_defined_threshold = st.number_input('Enter score threshold (between 0.01 and 0.99):', min_value=0.01, max_value=0.99, value=0.7, step=0.05)
+    # submit form
+    submit = st.form_submit_button('Set Threshold')
+st.write('\n')
+
+# filter and keep results with score above defined threshold
+zero_shot_results_clean = zero_shot_results.loc[(zero_shot_results['score'] >= user_defined_threshold)].copy()
+
+# rename columns
+sentiment_results.columns = ['tweet', 'sentiment']
+
+st.write(f'''
+    The predictions get better with a higher threshold, but this reduces the final number of tweets available for further analysis.
+    Out of the 10,000 tweets, we are now left with {len(zero_shot_results_clean)}.
+    We also add on the sentiment score for the tweets; the score here ranges from 0 (most negative) to 1 (most positive).
+''')
+
+# merge in sentiment score on index
+# drop unused columns
+classification_sentiment_df = pd.merge(zero_shot_results_clean, sentiment_results[['sentiment']], how='left', left_index=True, right_index=True)
+classification_sentiment_df = classification_sentiment_df[['tweet', 'category', 'score', 'sentiment']]
+st.dataframe(classification_sentiment_df)
+
+st.write(f'''
+    The difficult part of zero-shot classification is defining the right set of categories for each business case.
+    Some trial and error is required to find the appropriate words that can return the optimal results.
 ''')
 st.write('\n')
+
+# group by category, count tweets and get mean of sentiment
+classification_sentiment_agg = classification_sentiment_df.groupby(['category']).agg({'tweet':'count', 'sentiment':'mean'}).reset_index()
+classification_sentiment_agg = classification_sentiment_agg.rename(columns={'tweet':'count'})
+
+st.write(f'''
+    Finally, we can visualise the percentage of tweets in each category and the respective average sentiment scores.
+''')
+
+fig = px.pie(
+    classification_sentiment_agg,
+    values='count',
+    names='category',
+    hole=0.35,
+    title='Percentage of Tweets in Each Category',
+    template='simple_white',
+    width=1000,
+    height=600
+)
+fig.update_traces(textposition='inside', textinfo='percent+label')
+st.plotly_chart(fig)
+
+fig = px.bar(
+    classification_sentiment_agg,
+    x='category',
+    y='sentiment',
+    title='Average Sentiment of Tweets in Each Category <br><sup>Overall, the sentiment of the tweets is on the negative side.</sup>',
+    template='simple_white',
+    width=1000,
+    height=600
+)
+fig.update_yaxes(range=[0, 1])
+fig.add_hline(y=0.5, line_width=3, line_color='darkgreen')
+st.plotly_chart(fig)
+
+st.write('\n')
+st.markdown('''---''')
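For readers unfamiliar with the two pipelines wired in above, here is a minimal sketch of their output shapes, assuming the standard `transformers` pipeline API (shapes can vary slightly across library versions); the input string is made up for illustration:

```python
from transformers import pipeline

classifier_zero_shot = pipeline(
    task='zero-shot-classification',
    model='valhalla/distilbart-mnli-12-1',
)
classifier_sentiment = pipeline(
    task='sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    return_all_scores=True,
)

# zero-shot: a dict with labels sorted by descending score; scores sum to 1
result = classifier_zero_shot('a carbon tax would reshape the markets',
                              ['finance', 'politics', 'technology', 'wildlife'])
print(result['labels'][0], result['scores'][0])  # top category and its score

# sentiment with return_all_scores=True: NEGATIVE and POSITIVE scores per input,
# which is why app.py reads the second entry to get the positive-class score
print(classifier_sentiment('a carbon tax would reshape the markets'))
```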
data/climate_change_tweets.csv
ADDED
The diff for this file is too large to render. See raw diff

data/sentiment_results.csv
ADDED
The diff for this file is too large to render. See raw diff
data/tokyo_topics.csv
ADDED
@@ -0,0 +1,131 @@
+Topic,Count,Name
+-1,2624,-1_silver_medal_proud_mirabaichanu
+0,428,0_banda_zambia_barbra_barbra banda
+1,356,1_india_proud_indians_moment
+2,296,2_sutirtha_mukherjee_sutirtha mukherjee_tabletennis
+3,287,3_mirabaichanu hearty_lifting womens_mirabaichanu lifting_hearty congratulations
+4,248,4_race_road_road race_carapaz
+5,210,5_japan_volleyball_venezuela_flag
+6,195,6_kerr_sam_sam kerr_matildas
+7,183,7_vikas_boxing_krishan_vikas krishan
+8,163,8_gymnastics_mens gymnastics_max_whitlock
+9,148,9_tennis_murray_singles_nagal
+10,121,10_bbc_coverage_live_bbcsport
+11,120,11_ina_facebook_action_officialvkyadav
+12,115,12_puneethrajkumar cheer4india_dvitva james_dvitva_james puneethrajkumar
+13,113,13_hockey_south africa_gbhockey_africa
+14,100,14_judo_takato_gold_japans
+15,97,15_chanu_mirabai chanu_chanu wins_mirabai
+16,92,16_swimming_swimming swimming_aquatics_finals
+17,89,17_medal weightlifting_mirabaichanu winning_ace indian_congratulations ace
+18,87,18_q2_canwnt_corner_follow live
+19,85,19_winning medal_indias medal_medal india_medal
+20,84,20_basketball_3x3_3x3 basketball_usa
+21,80,21_butterfly_100m_heat_100m butterfly
+22,78,22_weightlifter_weightlifter mirabai_chanu_mirabai chanu
+23,78,23_kosovo_distria_krasniqi_distria krasniqi
+24,77,24_swevaus_damn_furniture_swevaus fuck
+25,75,25_yulo_carlos_carlos yulo_rings
+26,71,26_ceremony_opening ceremony_opening_drones
+27,69,27_medal ongoing_ongoing_winning indias_indias medal
+28,64,28_teamgb_gb_come_team gb
+29,62,29_sweden_swedes_swevaus_swedish
+30,61,30_sweden_australia_rolfo_fridolina
+31,60,31_japan_britain_great britain_japan great
+32,59,32_rule_remedy_remedy rule_butterfly
+33,55,33_silver medal_winning silver_silver_mirabaichanu winning
+34,52,34_mirabaichanu proud_proud_proud mirabaichanu_mirabaichanu
+35,51,35_chile_canada_beckie_janine
+36,51,36_mediasai_virenrasquinha imrahultrehan_iosindiaoff virenrasquinha_iosindiaoff
+37,49,37_clareburt_lewis_lewis clareburt_kalisz
+38,49,38_dressage_equestrian_horse_equestrian dressage
+39,47,39_mirabaichanu wins_49kg category_category_india snatches
+40,47,40_imrahultrehan congratulations_mirabaichanu mediasai_railminindia_iosindiaoff
+41,47,41_silver medal_tally_silver_medals tally
+42,47,42_penalty_penalty china_ref_referee
+43,45,43_country proud_medal country_country_winning silver
+44,45,44_teammalaysia_teamindia_teamina_congrats teammalaysia
+45,44,45_daddies_badminton_daddies badminton_ina
+46,44,46_chirag_rankireddy_shetty_chirag shetty
+47,44,47_countrys medal_bringing glory_glory medal_countrys
+48,43,48_medals_1001_medals won_1001 1001
+49,43,49_badminton_badmintonmalaysia_ina_wooi yik
+50,42,50_achieving medal_mirabaichanu achieving_achieving_medal india
+51,41,51_badminton_malaysia_double_sokongmalaysia
+52,41,52_sleep_saturday_hours_watch
+53,41,53_cheer4india_teamindia_da boys_teamindia best
+54,40,54_sweaus_sweaus football_swe_aus
+55,40,55_pistol_10m_air pistol_air
+56,39,56_medal weightlifting_winning silver_weightlifting_silver medal
+57,38,57_silver india_silver_india_mirabaichanu silver
+58,37,58_flying start_flying_start huge_huge congratulations
+59,36,59_archery_mixed team_korea_mixed
+60,35,60_covid19_covid_paralympics_test
+61,35,61_athletes_olympians_proud athletes_congratulations joebrier99
+62,35,62_penalty_swevaus_penalty swevaus_swevaus penalty
+63,35,63_pakistan_uae_athletes_afghanistan
+64,34,64_asked_asked happier_india elated_happier start
+65,34,65_smith_brendon_brendon smith_swim
+66,33,66_matildas_sweden_matildas sweden_attacking
+67,32,67_mirabaichanu cheer4india_cheer4india_cheer4india mirabaichanu_mirabaichanu congratulations
+68,32,68_day mirabaichanu_indias 1st_medal day_weightlifting india
+69,31,69_boxing_boxers_welterweights_delante
+70,31,70_loving_let party_officially held_waiting gymnastics
+71,31,71_400m_mens 400m_heat_400
+72,30,72_malaysia_malaysiaboleh_malaysiaboleh congrats_malaysia malaysia
+73,30,73_time india_india clinches_clinches medal_day hearty
+74,30,74_silver medal_medal india_silver_india
+75,30,75_mirabai chanu_mirabai_chanu_saikhom mirabai
+76,30,76_football_womens football_soccer_women
+77,30,77_mcgrail_peter mcgrail_peter_butdee
+78,29,78_display weightlifting_amazing display_absolutely amazing_display
+79,29,79_cheer4india_medal cheer4india_indias mirabaichanu_medal medal
+80,29,80_mirabaichanu teamindia_teamindia_teamindia mirabaichanu_proud teamindia
+81,29,81_spain_waterpolo_water polo_polo
+82,29,82_daddies_daddies daddies_daddies victory_mantap daddies
+83,28,83_pen_pen swevaus_swevaus pen_swevaus
+84,28,84_mirabaichanu mirabaichanu_mirabaichanu_congratulations mirabaichanu_power
+85,27,85_congratulations mirabai_chanu winning_mirabai chanu_49 kg
+86,27,86_silver weightlifting_huge congratulations_huge_winning silver
+87,27,87_qian_yang qian_yang_chinas
+88,27,88_medal womens_category_49kg_winning silver
+89,27,89_potential_massive potential_long term_term quick
+90,26,90_matildas_fark_pen matildas_matildas matildas
+91,26,91_grande_carapaz_hispanos_grande carapaz
+92,25,92_gift selflove_mensfashion_selflove_selfie mensfashion
+93,24,93_matildas_matildas swevaus_swevaus_swevaus matildas
+94,24,94_49_womens 49_49 kgs_kgs
+95,24,95_thematildas_goaustralia_thematildas samkerr1_goaustralia thematildas
+96,23,96_new zealand_zealand_hockey_new
+97,23,97_chanu secured_secured medal_secured_country winning
+98,23,98_weightlifting lets_lets cheer_cheer india_cheer
+99,23,99_raymondcupid kyereminator_kyereminator daterush_watch hisbella4_kyereminator
+100,22,100_mirabaichanu silver_silver_silver mirabaichanu_mam silver
+101,22,101_nigeria_ghana_team_ghanas
+102,22,102_aus_ausvswe_aussies_australia
+103,22,103_winning silver_mirabaichanu winning_weightlifting medal_silver weightlifting
+104,22,104_teamindia 49kg_silver medal_mirabaichanu won_medal radiant
+105,21,105_swimming_bbcsport_bbc_swimming heats
+106,21,106_mirabaichanu weightlifting_weightlifting_india mirabaichanu_india
+107,21,107_mirabaichanu weightlifting_weightlifting_spirits_bow
+108,21,108_history mirabai_teamindia mirabaichanu_chanu won_medal teamindia
+109,21,109_giochiolimpici_forzaazzurri_olimpiadi forzaazzurri_olimpiadi
+110,21,110_handball_portugal_egypt_esp
+111,21,111_seto_daiya_daiya seto_shock
+112,21,112_congratulations mirabai_chanu winning_chanu_mirabai
+113,21,113_brazil_netherlands_netherlands brazil_brazil womens
+114,20,114_mohanlal_mirabaichanu congratulations_mohanlal mirabaichanu_winning indias
+115,20,115_day congratulations_congratulations saikhom_49kg weightlift_weightlift
+116,20,116_saikhom_saikhom mirabai_congratulations saikhom_chanu winning
+117,20,117_dreams_criticism blood_sacrifice_criticism
+118,20,118_peaty_adam_adam peaty_adampeaty
+119,19,119_actor_medal winner_favourite actor_winner mirabaichanu
+120,19,120_peng_ying_chan peng_chan
+121,19,121_taekwondo_jin_barbosa_kurt
+122,18,122_fencing_samele fencing_samele_2nd round
+123,18,123_congratulated winning_mirabaichanu congratulated_congratulated_spoke
+124,18,124_strikes medal_india strikes_strikes_medal 49
+125,17,125_mirabaichanu comes_comes india_mohanlal mirabaichanu_mohanlal
+126,15,126_carrying_moment_proud moment_proud
+127,15,127_cheer4india_country cheer4india_teamindia_medal teamindia
+128,15,128_medal mirabai_kg womens_medal 49_womens weightlifting
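The Topic/Count/Name columns above follow BERTopic's topic-overview convention: topic -1 is the outlier bucket, and Name concatenates a topic's top c-TF-IDF terms. A hedged sketch of how this CSV was presumably exported, assuming the now-deleted model file:

```python
from bertopic import BERTopic

topic_model = BERTopic.load('models/bertopic_model_tokyo_olympics_tweets')

# get_topic_info() returns a dataframe with Topic, Count and Name columns;
# Topic -1 collects documents not assigned to any topic
topic_info = topic_model.get_topic_info()
topic_info.to_csv('data/tokyo_topics.csv', index=False)
```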
data/topics_tokyo.pickle
ADDED
Binary file (9.44 kB)

data/topics_tokyo_unclean.pickle
ADDED
Binary file (9.34 kB)

data/zero_shot_results.csv
ADDED
The diff for this file is too large to render. See raw diff
models/bertopic_model_tokyo_olympics_tweets_unclean
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0de856ed231c12e7baeaff15eb3159e1a5ef7c5512b459f915f46712f6d203a3
-size 71961846
models/distilbart-mnli-12-1/README.md
DELETED
@@ -1,59 +0,0 @@
----
-datasets:
-- mnli
-tags:
-- distilbart
-- distilbart-mnli
-pipeline_tag: zero-shot-classification
----
-
-# DistilBart-MNLI
-
-distilbart-mnli is the distilled version of bart-large-mnli created using the **No Teacher Distillation** technique proposed for BART summarisation by Huggingface, [here](https://github.com/huggingface/transformers/tree/master/examples/seq2seq#distilbart).
-
-We just copy alternating layers from `bart-large-mnli` and finetune more on the same data.
-
-| | matched acc | mismatched acc |
-| ------------------------------------------------------------------------------------ | ----------- | -------------- |
-| [bart-large-mnli](https://huggingface.co/facebook/bart-large-mnli) (baseline, 12-12) | 89.9 | 90.01 |
-| [distilbart-mnli-12-1](https://huggingface.co/valhalla/distilbart-mnli-12-1) | 87.08 | 87.5 |
-| [distilbart-mnli-12-3](https://huggingface.co/valhalla/distilbart-mnli-12-3) | 88.1 | 88.19 |
-| [distilbart-mnli-12-6](https://huggingface.co/valhalla/distilbart-mnli-12-6) | 89.19 | 89.01 |
-| [distilbart-mnli-12-9](https://huggingface.co/valhalla/distilbart-mnli-12-9) | 89.56 | 89.52 |
-
-This is a very simple and effective technique, as we can see the performance drop is very little.
-
-Detailed performance trade-offs will be posted in this [sheet](https://docs.google.com/spreadsheets/d/1dQeUvAKpScLuhDV1afaPJRRAE55s2LpIzDVA5xfqxvk/edit?usp=sharing).
-
-## Fine-tuning
-If you want to train these models yourself, clone the [distillbart-mnli repo](https://github.com/patil-suraj/distillbart-mnli) and follow the steps below
-
-Clone and install transformers from source
-```bash
-git clone https://github.com/huggingface/transformers.git
-pip install -qqq -U ./transformers
-```
-
-Download MNLI data
-```bash
-python transformers/utils/download_glue_data.py --data_dir glue_data --tasks MNLI
-```
-
-Create student model
-```bash
-python create_student.py \
-  --teacher_model_name_or_path facebook/bart-large-mnli \
-  --student_encoder_layers 12 \
-  --student_decoder_layers 6 \
-  --save_path student-bart-mnli-12-6 \
-```
-
-Start fine-tuning
-```bash
-python run_glue.py args.json
-```
-
-You can find the logs of these trained models in this [wandb project](https://wandb.ai/psuraj/distilbart-mnli).
models/distilbart-mnli-12-1/config.json
DELETED
@@ -1,56 +0,0 @@
-{
-  "_num_labels": 3,
-  "activation_dropout": 0.0,
-  "activation_function": "gelu",
-  "add_bias_logits": false,
-  "add_final_layer_norm": false,
-  "architectures": [
-    "BartForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "bos_token_id": 0,
-  "classif_dropout": 0.0,
-  "classifier_dropout": 0.0,
-  "d_model": 1024,
-  "decoder_attention_heads": 16,
-  "decoder_ffn_dim": 4096,
-  "decoder_layerdrop": 0.0,
-  "decoder_layers": 1,
-  "decoder_start_token_id": 2,
-  "dropout": 0.1,
-  "encoder_attention_heads": 16,
-  "encoder_ffn_dim": 4096,
-  "encoder_layerdrop": 0.0,
-  "encoder_layers": 12,
-  "eos_token_id": 2,
-  "extra_pos_embeddings": 2,
-  "finetuning_task": "mnli",
-  "force_bos_token_to_be_generated": false,
-  "forced_eos_token_id": 2,
-  "gradient_checkpointing": false,
-  "id2label": {
-    "0": "contradiction",
-    "1": "neutral",
-    "2": "entailment"
-  },
-  "init_std": 0.02,
-  "is_encoder_decoder": true,
-  "label2id": {
-    "contradiction": 0,
-    "entailment": 2,
-    "neutral": 1
-  },
-  "max_position_embeddings": 1024,
-  "model_type": "bart",
-  "normalize_before": false,
-  "normalize_embedding": true,
-  "num_hidden_layers": 12,
-  "output_past": false,
-  "pad_token_id": 1,
-  "scale_embedding": false,
-  "static_position_embeddings": false,
-  "total_flos": 153130534133111808,
-  "transformers_version": "4.7.0.dev0",
-  "use_cache": true,
-  "vocab_size": 50265
-}
models/distilbart-mnli-12-1/merges.txt
DELETED
The diff for this file is too large to render. See raw diff

models/distilbart-mnli-12-1/pytorch_model.bin
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:aa79ff59084a5036b07a9cffeaa1b1b7c1aa5edeb1885416a734c001a09aa046
-size 890410947
models/distilbart-mnli-12-1/special_tokens_map.json
DELETED
@@ -1 +0,0 @@
-{"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}

models/distilbart-mnli-12-1/tokenizer_config.json
DELETED
@@ -1 +0,0 @@
-{"model_max_length": 1024}

models/distilbart-mnli-12-1/vocab.json
DELETED
The diff for this file is too large to render. See raw diff
models/distilbert-base-uncased-finetuned-sst-2-english/README.md
DELETED
@@ -1,31 +0,0 @@
----
-language: en
-license: apache-2.0
-datasets:
-- sst-2
----
-
-# DistilBERT base uncased finetuned SST-2
-
-This model is a fine-tune checkpoint of [DistilBERT-base-uncased](https://huggingface.co/distilbert-base-uncased), fine-tuned on SST-2.
-This model reaches an accuracy of 91.3 on the dev set (for comparison, the bert-base-uncased version reaches an accuracy of 92.7).
-
-For more details about DistilBERT, we encourage users to check out [this model card](https://huggingface.co/distilbert-base-uncased).
-
-# Fine-tuning hyper-parameters
-
-- learning_rate = 1e-5
-- batch_size = 32
-- warmup = 600
-- max_seq_length = 128
-- num_train_epochs = 3.0
-
-# Bias
-
-Based on a few experimentations, we observed that this model could produce biased predictions that target underrepresented populations.
-
-For instance, for sentences like `This film was filmed in COUNTRY`, this binary classification model will give radically different probabilities for the positive label depending on the country (0.89 if the country is France, but 0.08 if the country is Afghanistan) when nothing in the input indicates such a strong semantic shift. In this [colab](https://colab.research.google.com/gist/ageron/fb2f64fb145b4bc7c49efc97e5f114d3/biasmap.ipynb), [Aurélien Géron](https://twitter.com/aureliengeron) made an interesting map plotting these probabilities for each country.
-
-<img src="https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/map.jpeg" alt="Map of positive probabilities per country." width="500"/>
-
-We strongly advise users to thoroughly probe these aspects on their use-cases in order to evaluate the risks of this model. We recommend looking at the following bias evaluation datasets as a place to start: [WinoBias](https://huggingface.co/datasets/wino_bias), [WinoGender](https://huggingface.co/datasets/super_glue), [Stereoset](https://huggingface.co/datasets/stereoset).
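The bias probe described in the deleted model card is straightforward to reproduce; a minimal sketch, using the two countries quoted above (the 0.89/0.08 figures are the card's own numbers, not re-measured here):

```python
from transformers import pipeline

classifier = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)

# the model card reports radically different positive probabilities here
for country in ['France', 'Afghanistan']:
    print(country, classifier(f'This film was filmed in {country}'))
```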
models/distilbert-base-uncased-finetuned-sst-2-english/config.json
DELETED
@@ -1,31 +0,0 @@
-{
-  "activation": "gelu",
-  "architectures": [
-    "DistilBertForSequenceClassification"
-  ],
-  "attention_dropout": 0.1,
-  "dim": 768,
-  "dropout": 0.1,
-  "finetuning_task": "sst-2",
-  "hidden_dim": 3072,
-  "id2label": {
-    "0": "NEGATIVE",
-    "1": "POSITIVE"
-  },
-  "initializer_range": 0.02,
-  "label2id": {
-    "NEGATIVE": 0,
-    "POSITIVE": 1
-  },
-  "max_position_embeddings": 512,
-  "model_type": "distilbert",
-  "n_heads": 12,
-  "n_layers": 6,
-  "output_past": true,
-  "pad_token_id": 0,
-  "qa_dropout": 0.1,
-  "seq_classif_dropout": 0.2,
-  "sinusoidal_pos_embds": false,
-  "tie_weights_": true,
-  "vocab_size": 30522
-}
models/distilbert-base-uncased-finetuned-sst-2-english/map.jpeg
DELETED
Binary file (81.6 kB)

models/distilbert-base-uncased-finetuned-sst-2-english/pytorch_model.bin
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:60554cbd7781b09d87f1ececbea8c064b94e49a7f03fd88e8775bfe6cc3d9f88
-size 267844284

models/distilbert-base-uncased-finetuned-sst-2-english/tokenizer_config.json
DELETED
@@ -1 +0,0 @@
-{"model_max_length": 512, "do_lower_case": true}

models/distilbert-base-uncased-finetuned-sst-2-english/vocab.txt
DELETED
The diff for this file is too large to render. See raw diff
survey_analytics_library.py
CHANGED
@@ -18,126 +18,6 @@ from nltk.corpus import stopwords
 
 
 
-# # create elbow plot with kmeans to find optimal number of clusters
-# def create_elbow_plot_kmeans(df, num_clusters, init_method='k-means++', n_init=10, random_state=42, plot=True, template='simple_white', save=False):
-#     '''
-#     create elbow plot with kmeans to find optimal number of clusters based on inertia
-#     where the clusters strikes a balance between being not segmented enough and being too fragmented
-
-#     we look for the point of diminishing returns (also known as the 'elbow') in terms of the inertia,
-#     where inertia is how close the data points are to their respective centers or centroids
-
-#     arguments:
-#     df (df): a dataframe of data to cluster
-#     num_clusters (int): number of clusters to plot
-#     init_method (str): default to 'k-means++', other option is 'random'
-#     n_init (int): default to 10, number of times to run model, cost from the best run will be used
-#     random_state (int): default to 42, random seed used to initialise the model
-#     plot (bool): default to True, option to turn off plots
-#     template (str): default to 'simple_white', change as desired
-#     save (bool): default to False, if True save plot as .html file
-
-#     returns:
-#     a list of inertia for each run
-#     '''
-
-#     # create empty list to store inertia for each run
-#     inertia = []
-#     # define range of clusters to try
-#     k = range(2, num_clusters+1)
-
-#     # loop through number of clusters
-#     for num_clusters in tqdm(k):
-#         # define model
-#         kmeans = KMeans(n_clusters=num_clusters, init=init_method, n_init=n_init, random_state=random_state)
-#         # fit and predict data
-#         kmeans.fit_predict(df)
-#         # get predicted labels
-#         predicted_labels = kmeans.labels_
-#         # append score to list of scores
-#         inertia.append(kmeans.inertia_)
-
-#     # plot elbow plot
-#     if plot:
-#         fig = px.line(
-#             pd.DataFrame({'num_clusters':list(k), 'inertia':inertia}),
-#             x='num_clusters',
-#             y='inertia',
-#             title='Elbow Plot for Optimal Number of Clusters with '+init_method,
-#             markers=True,
-#             template=template,
-#             width=800,
-#             height=500,
-#         )
-#         st.plotly_chart(fig, use_container_width=True)
-#         if save:
-#             fig.write_html('Elbow Plot for Optimal Number of Clusters with '+init_method+'.html')
-
-#     # return
-#     return inertia
-
-
-
-# # create plot of silhouette scores with sklearn model to find optimal number of clusters
-# def silhouette_score_plot_kmeans(df, num_clusters, init_method='k-means++', n_init=10, random_state=42, plot=True, template='simple_white', save=False):
-#     '''
-#     create plot of silhouette score with kmeans to find optimal number of clusters
-#     where the clusters strikes a balance between being not segmented enough and being too fragmented
-#     the closer the score is to 1, the more easily distinguishable are the clusters from each other
-
-#     arguments:
-#     df (df): a dataframe of data to cluster
-#     num_clusters (int): number of clusters to plot
-#     init_method (str): default to 'k-means++', other option is 'random'
-#     n_init (int): default to 10, number of times to run model, cost from the best run will be used
-#     random_state (int): default to 42, random seed used to initialise the model
-#     plot (bool): default to True, option to turn off plots
-#     template (str): default to 'simple_white', change as desired
-#     save (bool): default to False, if True save plot as .html file
-
-#     returns:
-#     a list of silhouette scores for each run
-#     '''
-
-#     # create empty list to store silhoutte scores for each run
-#     silhouette_scores = []
-#     # define range of clusters to try
-#     k = range(2, num_clusters+1)
-
-#     # loop through number of clusters
-#     for num_clusters in tqdm(k):
-#         # define model
-#         kmeans = KMeans(n_clusters=num_clusters, init=init_method, n_init=n_init, random_state=random_state)
-#         # fit and predict data
-#         kmeans.fit_predict(df)
-#         # get predicted labels
-#         predicted_labels = kmeans.labels_
-#         # get silhoutte score
-#         score = silhouette_score(df, predicted_labels)
-#         # append score to list of scores
-#         silhouette_scores.append(score)
-
-#     # plot silhouette scores
-#     if plot:
-#         fig = px.line(
-#             pd.DataFrame({'num_clusters':list(k), 'silhouette_scores':silhouette_scores}),
-#             x='num_clusters',
-#             y='silhouette_scores',
-#             title='Silhouette Scores for Optimal Number of Clusters with '+init_method,
-#             markers=True,
-#             template=template,
-#             width=800,
-#             height=500,
-#         )
-#         st.plotly_chart(fig, use_container_width=True)
-#         if save:
-#             fig.write_html('Silhouette Scores for Optimal Number of Clusters with '+init_method+'.html')
-
-#     # return
-#     return silhouette_scores
-
-
-
 # replace text with multiple replacements
 def replace_text(string, dict_of_replacements):
     '''
@@ -379,5 +259,41 @@ def convert_zero_shot_classification_output_to_dataframe(model_output):
     # drop unused columns
     results = results.drop(['labels', 'scores'], axis=1)
 
+    # return
+    return results
+
+
+# convert transformer model sentiment classification prediction into dataframe
+def convert_sentiment_classification_output_to_dataframe(text_input, model_output):
+    '''
+    convert sentiment classification output into a dataframe
+
+    the model used, distilbert-base-uncased-finetuned-sst-2-english, outputs a list of lists with two dictionaries;
+    within each dictionary is a label, negative or positive, and the respective score
+    [
+        [
+            {'label': 'NEGATIVE', 'score': 0.18449656665325165},
+            {'label': 'POSITIVE', 'score': 0.8155034780502319}
+        ],
+        ...
+    ]
+    the scores sum up to 1; we extract only the positive score in this function,
+    append the scores to the model's input and return a dataframe
+
+    arguments:
+    text_input (list): a list of sequences that is input for the model
+    model_output (list): a list of labels and scores
+
+    return:
+    a dataframe of sequences and sentiment score
+
+    '''
+    # store model positive scores as dataframe
+    results = pd.DataFrame(model_output)[[1]]
+    # get score from column
+    results = results[1].apply(lambda x: x.get('score'))
+    # store input sequences and scores as dataframe
+    results = pd.DataFrame({'sequence':text_input, 'score':results})
+
     # return
     return results
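A short usage sketch for the new helper, assuming the pipeline output shape documented in its docstring; the example tweets are made up for illustration:

```python
import survey_analytics_library as LIB
from transformers import pipeline

classifier_sentiment = pipeline(
    task='sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    return_all_scores=True,
)

tweets = ['the new climate bill is a disaster',
          'solar adoption is accelerating faster than expected']
output = classifier_sentiment(tweets)  # [[{NEGATIVE, ...}, {POSITIVE, ...}], ...]

# returns a dataframe with 'sequence' and 'score' (positive-class probability)
df = LIB.convert_sentiment_classification_output_to_dataframe(tweets, output)
print(df)
```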