Spaces:
Runtime error
Runtime error
update explanations
Browse files
app.py
CHANGED
@@ -22,7 +22,10 @@ from transformers import pipeline
|
|
22 |
# custom
|
23 |
import survey_analytics_library as LIB
|
24 |
|
25 |
-
|
|
|
|
|
|
|
26 |
|
27 |
# define data file path
|
28 |
data_path = 'data' + os.sep
|
@@ -85,7 +88,7 @@ df_factor_analysis = data_survey.copy()
|
|
85 |
st.subheader('Sample Survey Data')
|
86 |
st.write('''
|
87 |
Here we have a sample survey dataset where responders answer questions about their personality traits on a scale from 1 (Very Inaccurate) to 6 (Very Accurate).
|
88 |
-
Factor Analysis gives us \'factors\' or
|
89 |
''')
|
90 |
|
91 |
# split page into two columns
|
@@ -103,14 +106,20 @@ st.write('\n')
|
|
103 |
st.subheader('Factor Analysis Suitability')
|
104 |
st.write('''
|
105 |
Before performing Factor Analysis on the data, we need to evaluate if it is suitable to do so.
|
106 |
-
We apply two statistical tests (Bartlett's and KMO test) to the data.
|
|
|
|
|
107 |
''')
|
108 |
|
109 |
# interactive button to run statistical test to determine suitability for factor analysis
|
110 |
if st.button('Run Tests'):
|
111 |
-
# test
|
|
|
|
|
112 |
bartlett_sphericity_stat, p_value = calculate_bartlett_sphericity(x=df_factor_analysis)
|
113 |
-
# test how predictable
|
|
|
|
|
114 |
kmo_per_variable, kmo_total = calculate_kmo(x=df_factor_analysis)
|
115 |
# print test results
|
116 |
st.write(f'''
|
@@ -143,8 +152,9 @@ scree_df = pd.DataFrame({'Eigenvalues':eigenvalues, 'Number of Factors':list(ran
|
|
143 |
st.subheader('Number of Clusters?')
|
144 |
st.write(f'''
|
145 |
How many clusters or factors are appropriate for our data?
|
146 |
-
For Factor Analysis, we can determine the number of factors using the
|
147 |
-
|
|
|
148 |
''')
|
149 |
|
150 |
# plot scree plot
|
@@ -194,7 +204,7 @@ responder_factors['cluster'] = responder_factors.apply(lambda s: s.argmax(), axi
|
|
194 |
|
195 |
# define list of factor columns
|
196 |
list_of_factor_cols = [col for col in responder_factors.columns if 'factor_' in col]
|
197 |
-
st.subheader('
|
198 |
st.write('''
|
199 |
Factor analysis gives us a loading for every factor for each responder.
|
200 |
We assign each responder to a factor or cluster based on their maximum loading across all the factors.
|
@@ -218,7 +228,7 @@ fa_z_scores = fa_z_scores.apply(lambda x: round(x, 2))
|
|
218 |
|
219 |
st.write('''
|
220 |
Aggregating the scores of the clusters gives us detail insights to the personality traits of the responders.
|
221 |
-
The scores here have been normalised to Z-scores, a measure of how many standard deviations (SD) is the score away from the mean.
|
222 |
E.g. A Z-score of 0 indicates the score is identical to the mean, while a Z-score of 1 indicates the score is 1 SD away from the mean.
|
223 |
''')
|
224 |
# define colour map for highlighting cells
|
@@ -308,7 +318,7 @@ st.plotly_chart(fig, use_container_width=True)
|
|
308 |
|
309 |
st.write('''
|
310 |
Now we can see that the topics have improved.
|
311 |
-
We can make use of the top words in each topic to come up with a meaningful name.
|
312 |
''')
|
313 |
st.write('\n')
|
314 |
st.write('\n')
|
@@ -368,7 +378,6 @@ st.write(f'''
|
|
368 |
As an example, within the topic of 'Climate Change', we are interested in finance, politics, technology, and wildlife.
|
369 |
Using **Zero-shot Classification**, we can classify responses into one of these four categories.
|
370 |
As an added bonus, we can also find out how responders feel about the categories using **Sentiment Analysis**.
|
371 |
-
We'll use a different set of {len(sentiment_results):,} tweets related to climate change.
|
372 |
''')
|
373 |
st.write('\n')
|
374 |
|
@@ -429,20 +438,24 @@ with st.form('classify_tweets'):
|
|
429 |
sample_tweet_index = user_define_tweet
|
430 |
sample_tweet = sentiment_results['Tweet'].iloc[sample_tweet_index]
|
431 |
# input for user defined text
|
432 |
-
user_defined_input = st.text_input('Enter custom text (optional, leave blank to use
|
433 |
# check if user has entered any custom text
|
434 |
# if user_define_input is not blank, then override sample_tweet
|
435 |
if user_defined_input:
|
436 |
sample_tweet = user_defined_input
|
437 |
|
438 |
# submit form
|
439 |
-
submit = st.form_submit_button('Classify
|
440 |
st.write('\n')
|
441 |
|
442 |
st.write(f'''
|
443 |
Here are the results:
|
444 |
''')
|
445 |
-
|
|
|
|
|
|
|
|
|
446 |
|
447 |
# get predictions from models
|
448 |
zero_shot_sample = classifier_zero_shot(sample_tweet, candidate_labels)
|
@@ -454,6 +467,10 @@ sentiment_label = 'positive'
|
|
454 |
if sentiment_sample < 0.5:
|
455 |
sentiment_label = 'negative'
|
456 |
|
|
|
|
|
|
|
|
|
457 |
st.write(f'''
|
458 |
The main category is: **{zero_shot_sample['labels'][0]}** with a score of {round(zero_shot_sample['scores'][0], 2)}
|
459 |
Main category score ranges from 0 to 1, with 1 being very likely.
|
@@ -461,7 +478,7 @@ st.write(f'''
|
|
461 |
The full set of scores are: {dict(zip(zero_shot_sample['labels'], [round(score, 2) for score in zero_shot_sample['scores']]))}
|
462 |
The full set of scores adds up to 1.
|
463 |
|
464 |
-
The sentiment is: **{sentiment_label}** with a score of {round(sentiment_sample, 2)}
|
465 |
Sentiment score ranges from 0 to 1, with 1 being very positive.
|
466 |
''')
|
467 |
st.write('\n')
|
@@ -476,7 +493,7 @@ st.write(f'''
|
|
476 |
Let's review all the tweets and how they fall into the categories of finance, politics, technology, and wildlife.
|
477 |
''')
|
478 |
|
479 |
-
st.dataframe(zero_shot_results)
|
480 |
|
481 |
st.write(f'''
|
482 |
We can observe that the model does not have strong confidence in predicting the categories for some of the tweets.
|
@@ -487,7 +504,7 @@ st.write('\n')
|
|
487 |
|
488 |
# interactive input for user to define candidate labels and tweet index for analysis
|
489 |
with st.form('classification_score_threshold'):
|
490 |
-
user_defined_threshold = st.number_input('Enter score threshold (between 0
|
491 |
# submit form
|
492 |
submit = st.form_submit_button('Set Threshold')
|
493 |
st.write('\n')
|
@@ -500,7 +517,7 @@ sentiment_results.columns = ['tweet', 'sentiment']
|
|
500 |
|
501 |
st.write(f'''
|
502 |
The predictions get better with a higher threshold, but this reduces the final number of tweets available for further analysis.
|
503 |
-
Out of the
|
504 |
We also add on the sentiment score for the tweets, the score here ranges from 0 (most negative) to 1 (most positive).
|
505 |
''')
|
506 |
|
@@ -508,7 +525,11 @@ st.write(f'''
|
|
508 |
# drop unused columns
|
509 |
classification_sentiment_df = pd.merge(zero_shot_results_clean, sentiment_results[['sentiment']], how='left', left_index=True, right_index=True)
|
510 |
classification_sentiment_df = classification_sentiment_df[['tweet', 'category', 'score', 'sentiment']]
|
511 |
-
|
|
|
|
|
|
|
|
|
512 |
|
513 |
st.write(f'''
|
514 |
The difficult part for zero-shot classification is defining the right set of categories for each business case.
|
@@ -535,20 +556,20 @@ fig = px.pie(
|
|
535 |
height=600
|
536 |
)
|
537 |
fig.update_traces(textposition='inside', textinfo='percent+label')
|
538 |
-
st.plotly_chart(fig)
|
539 |
|
540 |
fig = px.bar(
|
541 |
classification_sentiment_agg,
|
542 |
x='category',
|
543 |
y='sentiment',
|
544 |
-
title='Average Sentiment of Tweets in Each Category
|
545 |
template='simple_white',
|
546 |
width=1000,
|
547 |
height=600
|
548 |
)
|
549 |
fig.update_yaxes(range=[0, 1])
|
550 |
fig.add_hline(y=0.5, line_width=3, line_color='darkgreen')
|
551 |
-
st.plotly_chart(fig)
|
552 |
|
553 |
st.write('\n')
|
554 |
-
st.markdown('''---''')
|
|
|
22 |
# custom
|
23 |
import survey_analytics_library as LIB
|
24 |
|
25 |
+
st.set_page_config(
|
26 |
+
page_title='Survey Analytics',
|
27 |
+
layout='centered',
|
28 |
+
)
|
29 |
|
30 |
# define data file path
|
31 |
data_path = 'data' + os.sep
|
|
|
88 |
st.subheader('Sample Survey Data')
|
89 |
st.write('''
|
90 |
Here we have a sample survey dataset where responders answer questions about their personality traits on a scale from 1 (Very Inaccurate) to 6 (Very Accurate).
|
91 |
+
Factor Analysis gives us \'factors\' or clusters of responders which provide us insights about the different personalities of the responders.
|
92 |
''')
|
93 |
|
94 |
# split page into two columns
|
|
|
106 |
st.subheader('Factor Analysis Suitability')
|
107 |
st.write('''
|
108 |
Before performing Factor Analysis on the data, we need to evaluate if it is suitable to do so.
|
109 |
+
We apply two statistical tests (Bartlett's and KMO test) to the data.
|
110 |
+
These two tests check if the variables in the data are correlated with each other.
|
111 |
+
If there isn't any correlation between the variables, then the data is unsuitable for factor analysis as there are no natural clusters.
|
112 |
''')
|
113 |
|
114 |
# interactive button to run statistical test to determine suitability for factor analysis
|
115 |
if st.button('Run Tests'):
|
116 |
+
# test if the data is an identity matrix
|
117 |
+
# an identity matrix is when the variables in the data are uncorrelated to other variables
|
118 |
+
# this means that the data is unsuitable for factor analysis as there are no natural clusters
|
119 |
bartlett_sphericity_stat, p_value = calculate_bartlett_sphericity(x=df_factor_analysis)
|
120 |
+
# test how predictable a variable is from the other variables in the data
|
121 |
+
# if variables are unpredictable or uncorrelated
|
122 |
+
# this means that the data is unsuitable for factor analysis as there are no natural clusters
|
123 |
kmo_per_variable, kmo_total = calculate_kmo(x=df_factor_analysis)
|
124 |
# print test results
|
125 |
st.write(f'''
|
|
|
152 |
st.subheader('Number of Clusters?')
|
153 |
st.write(f'''
|
154 |
How many clusters or factors are appropriate for our data?
|
155 |
+
For Factor Analysis, we can determine the number of factors using the eigenvalues and a scree plot.
|
156 |
+
E.g. A factor with an eigenvalue of 5 means that we can represent 5 variables from the data with just 1 factor.
|
157 |
+
The Kaiser criterion suggests that we should include factors with an eigenvalue of at least 1, so the factors included should at least represent 1 variable.
|
158 |
''')
|
159 |
|
160 |
# plot scree plot
|
|
|
204 |
|
205 |
# define list of factor columns
|
206 |
list_of_factor_cols = [col for col in responder_factors.columns if 'factor_' in col]
|
207 |
+
st.subheader('Factor Analysis Results')
|
208 |
st.write('''
|
209 |
Factor analysis gives us a loading for every factor for each responder.
|
210 |
We assign each responder to a factor or cluster based on their maximum loading across all the factors.
|
|
|
228 |
|
229 |
st.write('''
|
230 |
Aggregating the scores of the clusters gives us detailed insights into the personality traits of the responders.
|
231 |
+
The scores here have been normalised to Z-scores, which is a measure of how many standard deviations (SD) is the score away from the mean.
|
232 |
E.g. A Z-score of 0 indicates the score is identical to the mean, while a Z-score of 1 indicates the score is 1 SD away from the mean.
|
233 |
''')
|
234 |
# define colour map for highlighting cells
|
|
|
318 |
|
319 |
st.write('''
|
320 |
Now we can see that the topics have improved.
|
321 |
+
We can make use of the top words in each topic to come up with a meaningful name; this has to be done manually and is subjective.
|
322 |
''')
|
323 |
st.write('\n')
|
324 |
st.write('\n')
|
|
|
378 |
As an example, within the topic of 'Climate Change', we are interested in finance, politics, technology, and wildlife.
|
379 |
Using **Zero-shot Classification**, we can classify responses into one of these four categories.
|
380 |
As an added bonus, we can also find out how responders feel about the categories using **Sentiment Analysis**.
|
|
|
381 |
''')
|
382 |
st.write('\n')
|
383 |
|
|
|
438 |
sample_tweet_index = user_define_tweet
|
439 |
sample_tweet = sentiment_results['Tweet'].iloc[sample_tweet_index]
|
440 |
# input for user defined text
|
441 |
+
user_defined_input = st.text_input('Enter custom text (optional, leave blank to use tweets):', '')
|
442 |
# check if user has entered any custom text
|
443 |
# if user_define_input is not blank, then override sample_tweet
|
444 |
if user_defined_input:
|
445 |
sample_tweet = user_defined_input
|
446 |
|
447 |
# submit form
|
448 |
+
submit = st.form_submit_button('Classify Text')
|
449 |
st.write('\n')
|
450 |
|
451 |
st.write(f'''
|
452 |
Here are the results:
|
453 |
''')
|
454 |
+
|
455 |
+
if user_defined_input:
|
456 |
+
st.write(f'Custom Text: *\'{sample_tweet}\'*')
|
457 |
+
else:
|
458 |
+
st.write(f'Selected Tweet: *\'{sample_tweet}\'*')
|
459 |
|
460 |
# get predictions from models
|
461 |
zero_shot_sample = classifier_zero_shot(sample_tweet, candidate_labels)
|
|
|
467 |
if sentiment_sample < 0.5:
|
468 |
sentiment_label = 'negative'
|
469 |
|
470 |
+
emoji = {
|
471 |
+
'positive':'😀',
|
472 |
+
'negative':'☹️',
|
473 |
+
}
|
474 |
st.write(f'''
|
475 |
The main category is: **{zero_shot_sample['labels'][0]}** with a score of {round(zero_shot_sample['scores'][0], 2)}
|
476 |
Main category score ranges from 0 to 1, with 1 being very likely.
|
|
|
478 |
The full set of scores are: {dict(zip(zero_shot_sample['labels'], [round(score, 2) for score in zero_shot_sample['scores']]))}
|
479 |
The full set of scores adds up to 1.
|
480 |
|
481 |
+
The sentiment is: {emoji[sentiment_label]} **{sentiment_label}** with a score of {round(sentiment_sample, 2)}
|
482 |
Sentiment score ranges from 0 to 1, with 1 being very positive.
|
483 |
''')
|
484 |
st.write('\n')
|
|
|
493 |
Let's review all the tweets and how they fall into the categories of finance, politics, technology, and wildlife.
|
494 |
''')
|
495 |
|
496 |
+
st.dataframe(zero_shot_results.style.highlight_max(axis=1, subset=['finance', 'politics', 'technology', 'wildlife'], props='color:white; background-color:green;').format(precision=2))
|
497 |
|
498 |
st.write(f'''
|
499 |
We can observe that the model does not have strong confidence in predicting the categories for some of the tweets.
|
|
|
504 |
|
505 |
# interactive input for user to define candidate labels and tweet index for analysis
|
506 |
with st.form('classification_score_threshold'):
|
507 |
+
user_defined_threshold = st.number_input('Enter score threshold (between 0 and 1):', min_value=0.0, max_value=1.0, value=0.7, step=0.05)
|
508 |
# submit form
|
509 |
submit = st.form_submit_button('Set Threshold')
|
510 |
st.write('\n')
|
|
|
517 |
|
518 |
st.write(f'''
|
519 |
The predictions get better with a higher threshold, but this reduces the final number of tweets available for further analysis.
|
520 |
+
Out of the {len(sentiment_results):,} tweets, we are now left with {len(zero_shot_results_clean)}.
|
521 |
We also add on the sentiment score for the tweets, the score here ranges from 0 (most negative) to 1 (most positive).
|
522 |
''')
|
523 |
|
|
|
525 |
# drop unused columns
|
526 |
classification_sentiment_df = pd.merge(zero_shot_results_clean, sentiment_results[['sentiment']], how='left', left_index=True, right_index=True)
|
527 |
classification_sentiment_df = classification_sentiment_df[['tweet', 'category', 'score', 'sentiment']]
|
528 |
+
|
529 |
+
def highlight_sentiment(value):
|
530 |
+
color = 'green' if value >= 0.5 else 'red'
|
531 |
+
return 'color:{}'.format(color)
|
532 |
+
st.dataframe(classification_sentiment_df.style.applymap(highlight_sentiment, subset=['sentiment']).format(precision=2))
|
533 |
|
534 |
st.write(f'''
|
535 |
The difficult part for zero-shot classification is defining the right set of categories for each business case.
|
|
|
556 |
height=600
|
557 |
)
|
558 |
fig.update_traces(textposition='inside', textinfo='percent+label')
|
559 |
+
st.plotly_chart(fig, use_container_width=True)
|
560 |
|
561 |
fig = px.bar(
|
562 |
classification_sentiment_agg,
|
563 |
x='category',
|
564 |
y='sentiment',
|
565 |
+
title='Average Sentiment of Tweets in Each Category',
|
566 |
template='simple_white',
|
567 |
width=1000,
|
568 |
height=600
|
569 |
)
|
570 |
fig.update_yaxes(range=[0, 1])
|
571 |
fig.add_hline(y=0.5, line_width=3, line_color='darkgreen')
|
572 |
+
st.plotly_chart(fig, use_container_width=True)
|
573 |
|
574 |
st.write('\n')
|
575 |
+
st.markdown('''---''')
|