mmahesh873 committed
Commit 8316f1a · 1 Parent(s): bad3784

improved descriptions

Files changed (1): app.py +19 -11
app.py CHANGED
@@ -22,8 +22,8 @@ st.write(other_info_dict['data_description'])


 # %%
-st.header("Prompt")
-st.write("For each data point in the evaluation dataset, we create a prompt for LLM by adding the context and the question to the below prompt template, while following the same structure of the prompt template.")
+st.header("Prompts")
+st.write("For each data point in the evaluation dataset and every prompt template described below, we create a prompt for the LLM by adding the context and the question to the prompt template, following the structure of the template.")


 prompt_options_dict = {
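The prompt construction described in this hunk is plain string templating: each data point's context and question are substituted into a fixed template. A minimal sketch, assuming a hypothetical template; the actual templates are the values of prompt_options_dict in app.py.

```python
# Sketch only: a hypothetical prompt template; the real templates are the
# values of prompt_options_dict in app.py.
PROMPT_TEMPLATE = (
    "Answer the question using only the given context. "
    "If the question cannot be answered from the context, reply with <NO ANSWER>.\n\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "Answer:"
)

def build_prompt(context: str, question: str, template: str = PROMPT_TEMPLATE) -> str:
    """Insert a data point's context and question into the prompt template."""
    return template.format(context=context, question=question)

print(build_prompt("The Eiffel Tower is located in Paris.", "Where is the Eiffel Tower?"))
```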
@@ -36,7 +36,7 @@ prompt_options_dict = {
 prompt_options_list = list(prompt_options_dict.keys())

 options = st.multiselect(
-    'Select prompts:',
+    'Select one or more prompts:',
     prompt_options_list,
     [prompt_options_list[0]])

@@ -110,7 +110,7 @@ st.write("In the case that the LLM answers <NO ANSWER>, the output is set to an
 # %%

 st.header('Performance metric')
-st.write("""The performance metric used is an estimation of the percentage of correctly answered questions, i.e. the output of the model coincides with one of the ground truth answers. The performance metric can also be interpreted as the probability that the model correctly answers a question. The performance of the model is evaluated with the exact match accuracy metric (see compute_exact function in SQuAD2.0 official evaluation script [here](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)), taking values in [0,1], where 0 is worst (model always wrong), and 1 is best (model always correct). It is the number of correctly answered questions divided by the number of data points. An answer is considered to be correctly answered (by the model), if the predicted answer after normalization (text is converted to lowercase, and punctuation, articles and extra whitespace are removed) matches exactly with any of the normalized ground truth answers. In the case of unanswerable questions, the empty string is considered to be the only ground truth answer.""")
+st.write("""The performance metric used is an estimate of the percentage of correctly answered questions, i.e. questions for which the output of the model coincides with one of the ground truth answers. The performance metric can also be interpreted as the probability that the model correctly answers a question. The performance of the model is evaluated with the exact match accuracy metric (see the compute_exact function in the official SQuAD2.0 evaluation script [here](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)), taking values in [0,1], where 0 is worst (model always wrong) and 1 is best (model always correct). It is the number of correctly answered questions divided by the number of data points. An answer is considered correctly answered (by the model) if the predicted answer, after normalization (text is converted to lowercase; punctuation, articles and extra whitespace are removed), matches exactly one of the normalized ground truth answers. In the case of unanswerable questions, the empty string is considered to be the only ground truth answer.""")
 with st.container():

     overall_performance_list = []
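The exact match computation described above can be sketched as follows. This is a simplified restatement in the spirit of the SQuAD2.0 evaluation script's normalization and compute_exact, not a copy of it.

```python
import re
import string

def normalize_answer(s: str) -> str:
    """Lowercase, remove punctuation, drop articles (a/an/the), collapse whitespace."""
    s = s.lower()
    s = "".join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())

def exact_match(prediction: str, ground_truths: list[str]) -> int:
    """1 if the normalized prediction equals any normalized ground truth, else 0.
    For unanswerable questions the only ground truth is the empty string."""
    gts = ground_truths if ground_truths else [""]
    return int(any(normalize_answer(prediction) == normalize_answer(gt) for gt in gts))

# Overall performance = correctly answered questions / number of data points.
predictions = ["Paris", "", "the Alps"]
answers = [["Paris"], [], ["Alps"]]
performance = sum(exact_match(p, a) for p, a in zip(predictions, answers)) / len(predictions)
print(performance)  # 1.0 for this toy example
```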
@@ -129,16 +129,22 @@ with st.container():

 if len(options) > 1:
     st.subheader('Statistics of performance metrics across selected prompts')
-    st.write('Mean ', perf_dict['mean'])
-    st.write('Standard deviation: ', perf_dict['std'])
-    st.write('Minimum ', perf_dict['min'])
-    st.write('Maximum ', perf_dict['max'])
+    st.write('The statistics of the overall performance across the selected prompts are provided below.')
+    temp_stat_df = pd.DataFrame({
+        'Statistic': ['Mean', 'Standard deviation', 'Minimum', 'Maximum'],
+        'Value': [perf_dict['mean'], perf_dict['std'], perf_dict['min'], perf_dict['max']]
+    })
+    st.dataframe(temp_stat_df.set_index(temp_stat_df.columns[0]))
+    # st.write('Mean ', perf_dict['mean'])
+    # st.write('Standard deviation: ', perf_dict['std'])
+    # st.write('Minimum ', perf_dict['min'])
+    # st.write('Maximum ', perf_dict['max'])



 # %%
 st.header("Bias ratios")
-st.write('Bias ratio is defined as the ratio of the lowest performance to the highest performance among categories that have sufficient data (with more than 50 data points) for a characteristic. The following table shows the bias ratio for each of the considered characteristic.')
+st.write('The bias ratio is defined as the ratio of the lowest performance to the highest performance among the categories of a characteristic that have sufficient data (more than 50 data points). The following table shows the bias ratio for each of the considered characteristics.')

 processed_t_opt_dict = {}
 ch_df = None
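The bias ratio defined above is, per characteristic, a min/max over the per-category performances, restricted to categories with more than 50 data points. A minimal sketch with made-up numbers; the app derives the per-category performances from its evaluation results.

```python
import pandas as pd

# Made-up per-category performances and data-point counts for one characteristic.
category_stats = pd.DataFrame({
    "category": ["A", "B", "C"],
    "performance": [0.71, 0.64, 0.90],
    "n_points": [820, 760, 12],
})

def bias_ratio(df: pd.DataFrame, min_points: int = 50) -> float:
    """Lowest performance divided by highest performance among categories
    with more than `min_points` data points."""
    eligible = df[df["n_points"] > min_points]
    return eligible["performance"].min() / eligible["performance"].max()

print(round(bias_ratio(category_stats), 3))  # 0.901; category C is ignored (only 12 points)
```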
@@ -192,7 +198,9 @@ t_ch_df = pd.DataFrame({
 })
 t_ch_df.index = ch_df.index
 if len(options) > 1:
+
     with st.container():
+        st.write('For each characteristic, the statistics of the bias ratios across the selected prompt options are provided below.')
         st.dataframe(t_ch_df)


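The construction of t_ch_df itself lies outside this hunk, but a per-characteristic table of bias-ratio statistics across the selected prompts can be sketched with a simple groupby aggregation. Values and column names below are illustrative only.

```python
import pandas as pd

# Made-up bias ratios per characteristic and prompt option.
bias_df = pd.DataFrame({
    "Characteristic": ["A", "A", "B", "B"],
    "Prompt option": ["Prompt 1", "Prompt 2", "Prompt 1", "Prompt 2"],
    "Bias ratio": [0.90, 0.86, 0.75, 0.81],
})

# One row per characteristic with statistics across the selected prompt options,
# analogous to the t_ch_df table displayed in the app.
stats_per_characteristic = bias_df.groupby("Characteristic")["Bias ratio"].agg(
    ["mean", "std", "min", "max"]
)
print(stats_per_characteristic)
```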
@@ -214,7 +222,7 @@ for t_opt in options:
 st.write('Prompt used : ', t_opt)
 t_pert_df_global = result_processor_obj_dict[t_opt].get_global_perturbers_df()
 t_pert_df_global['Prompt option'] = processed_t_opt_dict[t_opt]
-t_pert_df_global['Prompt'] = t_pert_df_global['Perturbation family'] + ' - ' + processed_t_opt_dict[t_opt]
+t_pert_df_global['Perturbation family - Prompt'] = t_pert_df_global['Perturbation family'] + ' - ' + processed_t_opt_dict[t_opt]

 t_pert_fig = px.line(t_pert_df_global, x="Levels", y="Performance", color='Perturbation family')
 t_pert_fig.update_xaxes(tickmode='linear', dtick=1)
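The 'Perturbation family - Prompt' column added above is a combined label so that the merged figure in the next hunk can draw one line, with its own colour, per perturbation-family/prompt pair. A self-contained sketch with made-up data; the column names follow the app, while the family and prompt names are illustrative.

```python
import pandas as pd
import plotly.express as px

# Made-up performance vs. perturbation level for two families and two prompts.
rows = []
for prompt_idx, prompt in enumerate(["Prompt 1", "Prompt 2"]):
    for family_idx, family in enumerate(["Family A", "Family B"]):
        for level in range(5):
            rows.append({
                "Levels": level,
                "Performance": 0.85 - 0.04 * level - 0.05 * family_idx - 0.03 * prompt_idx,
                "Perturbation family": family,
                # Combined key: one line/colour per (family, prompt) pair in the merged plot.
                "Perturbation family - Prompt": f"{family} - {prompt}",
            })
df = pd.DataFrame(rows)

fig = px.line(df, x="Levels", y="Performance", color="Perturbation family - Prompt")
fig.update_xaxes(tickmode="linear", dtick=1)
fig.show()  # the app renders the equivalent figure with st.plotly_chart(...)
```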
@@ -229,7 +237,7 @@ t_pert_df_global_merged_df_2 = t_pert_df_global_merged_df[['Prompt option', 'Per
 t_pert_df_global_merged_df_2.set_index(t_pert_df_global_merged_df_2.columns[0])
 # st.dataframe(t_pert_df_global_merged_df_2)

-temp_merged_fig = px.line(t_pert_df_global_merged_df, x="Levels", y="Performance", color='Prompt')
+temp_merged_fig = px.line(t_pert_df_global_merged_df, x="Levels", y="Performance", color='Perturbation family - Prompt')
 temp_merged_fig.update_xaxes(tickmode='linear', dtick=1)
 st.plotly_chart(temp_merged_fig, theme="streamlit", use_container_width=True)

 