mmahesh873 committed
Commit 8316f1a · 1 Parent(s): bad3784

improved descriptions

Files changed (1): app.py +19 -11
app.py CHANGED
@@ -22,8 +22,8 @@ st.write(other_info_dict['data_description'])


 # %%
-st.header("Prompt")
-st.write("For each data point in the evaluation dataset, we create a prompt for LLM by adding the context and the question to the below prompt template, while following the same structure of the prompt template.")
+st.header("Prompts")
+st.write("For each data point in the evaluation dataset and every prompt template described below, we create a prompt for the LLM by adding the context and the question to the prompt template, following the structure of the template.")


 prompt_options_dict = {
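The prompt construction described in this hunk is plain string templating: each data point's context and question are substituted into a fixed template. A minimal sketch, assuming a hypothetical template; the actual templates are the values of prompt_options_dict in app.py.

```python
# Sketch only: a hypothetical prompt template; the real templates are the
# values of prompt_options_dict in app.py.
PROMPT_TEMPLATE = (
    "Answer the question using only the given context. "
    "If the question cannot be answered from the context, reply with <NO ANSWER>.\n\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "Answer:"
)

def build_prompt(context: str, question: str, template: str = PROMPT_TEMPLATE) -> str:
    """Insert a data point's context and question into the prompt template."""
    return template.format(context=context, question=question)

print(build_prompt("The Eiffel Tower is located in Paris.", "Where is the Eiffel Tower?"))
```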
@@ -36,7 +36,7 @@ prompt_options_dict = {
 prompt_options_list = list(prompt_options_dict.keys())

 options = st.multiselect(
-    'Select prompts:',
+    'Select one or more prompts:',
     prompt_options_list,
     [prompt_options_list[0]])

@@ -110,7 +110,7 @@ st.write("In the case that the LLM answers <NO ANSWER>, the output is set to an
 # %%

 st.header('Performance metric')
-st.write("""The performance metric used is an estimation of the percentage of correctly answered questions, i.e. the output of the model coincides with one of the ground truth answers. The performance metric can also be interpreted as the probability that the model correctly answers a question. The performance of the model is evaluated with the exact match accuracy metric (see compute_exact function in SQuAD2.0 official evaluation script [here](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)), taking values in [0,1], where 0 is worst (model always wrong), and 1 is best (model always correct). It is the number of correctly answered questions divided by the number of data points. An answer is considered to be correctly answered (by the model), if the predicted answer after normalization (text is converted to lowercase, and punctuation, articles and extra whitespace are removed) matches exactly with any of the normalized ground truth answers. In the case of unanswerable questions, the empty string is considered to be the only ground truth answer.""")
+st.write("""The performance metric used is an estimate of the percentage of correctly answered questions, i.e. questions for which the output of the model coincides with one of the ground truth answers. The performance metric can also be interpreted as the probability that the model correctly answers a question. The performance of the model is evaluated with the exact match accuracy metric (see the compute_exact function in the official SQuAD2.0 evaluation script [here](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)), taking values in [0,1], where 0 is worst (model always wrong) and 1 is best (model always correct). It is the number of correctly answered questions divided by the number of data points. An answer is considered correctly answered (by the model) if the predicted answer, after normalization (text is converted to lowercase; punctuation, articles and extra whitespace are removed), matches exactly one of the normalized ground truth answers. In the case of unanswerable questions, the empty string is considered to be the only ground truth answer.""")
 with st.container():

     overall_performance_list = []
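The exact match computation described above can be sketched as follows. This is a simplified restatement in the spirit of the SQuAD2.0 evaluation script's normalization and compute_exact, not a copy of it.

```python
import re
import string

def normalize_answer(s: str) -> str:
    """Lowercase, remove punctuation, drop articles (a/an/the), collapse whitespace."""
    s = s.lower()
    s = "".join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())

def exact_match(prediction: str, ground_truths: list[str]) -> int:
    """1 if the normalized prediction equals any normalized ground truth, else 0.
    For unanswerable questions the only ground truth is the empty string."""
    gts = ground_truths if ground_truths else [""]
    return int(any(normalize_answer(prediction) == normalize_answer(gt) for gt in gts))

# Overall performance = correctly answered questions / number of data points.
predictions = ["Paris", "", "the Alps"]
answers = [["Paris"], [], ["Alps"]]
performance = sum(exact_match(p, a) for p, a in zip(predictions, answers)) / len(predictions)
print(performance)  # 1.0 for this toy example
```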
@@ -129,16 +129,22 @@ with st.container():

 if len(options) > 1:
     st.subheader('Statistics of performance metrics across selected prompts')
-    st.write('Mean ', perf_dict['mean'])
-    st.write('Standard deviation: ', perf_dict['std'])
-    st.write('Minimum ', perf_dict['min'])
-    st.write('Maximum ', perf_dict['max'])
+    st.write('The statistics of the overall performance across the selected prompts are provided below.')
+    temp_stat_df = pd.DataFrame({
+        'Statistic': ['Mean', 'Standard deviation', 'Minimum', 'Maximum'],
+        'Value': [perf_dict['mean'], perf_dict['std'], perf_dict['min'], perf_dict['max']]
+    })
+    st.dataframe(temp_stat_df.set_index(temp_stat_df.columns[0]))
+    # st.write('Mean ', perf_dict['mean'])
+    # st.write('Standard deviation: ', perf_dict['std'])
+    # st.write('Minimum ', perf_dict['min'])
+    # st.write('Maximum ', perf_dict['max'])



 # %%
 st.header("Bias ratios")
-st.write('Bias ratio is defined as the ratio of the lowest performance to the highest performance among categories that have sufficient data (with more than 50 data points) for a characteristic. The following table shows the bias ratio for each of the considered characteristic.')
+st.write('The bias ratio is defined as the ratio of the lowest performance to the highest performance among the categories of a characteristic that have sufficient data (more than 50 data points). The following table shows the bias ratio for each of the considered characteristics.')

 processed_t_opt_dict = {}
 ch_df = None
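The bias ratio defined above is, per characteristic, a min/max over the per-category performances, restricted to categories with more than 50 data points. A minimal sketch with made-up numbers; the app derives the per-category performances from its evaluation results.

```python
import pandas as pd

# Made-up per-category performances and data-point counts for one characteristic.
category_stats = pd.DataFrame({
    "category": ["A", "B", "C"],
    "performance": [0.71, 0.64, 0.90],
    "n_points": [820, 760, 12],
})

def bias_ratio(df: pd.DataFrame, min_points: int = 50) -> float:
    """Lowest performance divided by highest performance among categories
    with more than `min_points` data points."""
    eligible = df[df["n_points"] > min_points]
    return eligible["performance"].min() / eligible["performance"].max()

print(round(bias_ratio(category_stats), 3))  # 0.901; category C is ignored (only 12 points)
```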
@@ -192,7 +198,9 @@ t_ch_df = pd.DataFrame({
 })
 t_ch_df.index = ch_df.index
 if len(options) > 1:
+
     with st.container():
+        st.write('For each characteristic, the statistics of the bias ratios across the selected prompt options are provided below.')
         st.dataframe(t_ch_df)


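The construction of t_ch_df itself lies outside this hunk, but a per-characteristic table of bias-ratio statistics across the selected prompts can be sketched with a simple groupby aggregation. Values and column names below are illustrative only.

```python
import pandas as pd

# Made-up bias ratios per characteristic and prompt option.
bias_df = pd.DataFrame({
    "Characteristic": ["A", "A", "B", "B"],
    "Prompt option": ["Prompt 1", "Prompt 2", "Prompt 1", "Prompt 2"],
    "Bias ratio": [0.90, 0.86, 0.75, 0.81],
})

# One row per characteristic with statistics across the selected prompt options,
# analogous to the t_ch_df table displayed in the app.
stats_per_characteristic = bias_df.groupby("Characteristic")["Bias ratio"].agg(
    ["mean", "std", "min", "max"]
)
print(stats_per_characteristic)
```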
@@ -214,7 +222,7 @@ for t_opt in options:
 st.write('Prompt used : ', t_opt)
 t_pert_df_global = result_processor_obj_dict[t_opt].get_global_perturbers_df()
 t_pert_df_global['Prompt option'] = processed_t_opt_dict[t_opt]
-t_pert_df_global['Prompt'] = t_pert_df_global['Perturbation family'] + ' - ' + processed_t_opt_dict[t_opt]
+t_pert_df_global['Perturbation family - Prompt'] = t_pert_df_global['Perturbation family'] + ' - ' + processed_t_opt_dict[t_opt]

 t_pert_fig = px.line(t_pert_df_global, x="Levels", y="Performance", color='Perturbation family')
 t_pert_fig.update_xaxes(tickmode='linear', dtick=1)
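The 'Perturbation family - Prompt' column added above is a combined label so that the merged figure in the next hunk can draw one line, with its own colour, per perturbation-family/prompt pair. A self-contained sketch with made-up data; the column names follow the app, while the family and prompt names are illustrative.

```python
import pandas as pd
import plotly.express as px

# Made-up performance vs. perturbation level for two families and two prompts.
rows = []
for prompt_idx, prompt in enumerate(["Prompt 1", "Prompt 2"]):
    for family_idx, family in enumerate(["Family A", "Family B"]):
        for level in range(5):
            rows.append({
                "Levels": level,
                "Performance": 0.85 - 0.04 * level - 0.05 * family_idx - 0.03 * prompt_idx,
                "Perturbation family": family,
                # Combined key: one line/colour per (family, prompt) pair in the merged plot.
                "Perturbation family - Prompt": f"{family} - {prompt}",
            })
df = pd.DataFrame(rows)

fig = px.line(df, x="Levels", y="Performance", color="Perturbation family - Prompt")
fig.update_xaxes(tickmode="linear", dtick=1)
fig.show()  # the app renders the equivalent figure with st.plotly_chart(...)
```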
@@ -229,7 +237,7 @@ t_pert_df_global_merged_df_2 = t_pert_df_global_merged_df[['Prompt option', 'Per
 t_pert_df_global_merged_df_2.set_index(t_pert_df_global_merged_df_2.columns[0])
 # st.dataframe(t_pert_df_global_merged_df_2)

-temp_merged_fig = px.line(t_pert_df_global_merged_df, x="Levels", y="Performance", color='Prompt')
+temp_merged_fig = px.line(t_pert_df_global_merged_df, x="Levels", y="Performance", color='Perturbation family - Prompt')
 temp_merged_fig.update_xaxes(tickmode='linear', dtick=1)
 st.plotly_chart(temp_merged_fig, theme="streamlit", use_container_width=True)

 