Yotam-Perlitz committed on
Commit
40b9d90
•
1 Parent(s): 298500e

revising app


Signed-off-by: Yotam-Perlitz <[email protected]>

Files changed (1)
  1. app.py +264 -85
app.py CHANGED
@@ -6,59 +6,226 @@ import plotly.express as px
 import streamlit as st
 from bat import Benchmark, Config, Reporter, Tester
 
-holistic_scenarios = [
-    "arena_hard",
-    "mixeval",
-    "agieval",
-    "arc_c",
-    "alpacav1",
-    "alpacav2",
-    "alpacaeval2_lc",
-    "arena_elo",
-    "bbh",
-    "eq_benchv2",
-    "gpt4all",
-    "hugging_6",
-    "llmonitor",
-    "magi",
-    "mmlu",
-    "mt_bench",
-    "biggen_mwr",
-    "olmes_average",
-    "mmlu_pro",
-]
-
 
 def get_nice_benchmark_name(bench_name):
-    benchmarks_dict = {
-        "arena_elo": "LMSys Arena",
-        "mt_bench": "MT Bench",
-        "mixeval": "Mix Eval",
-        "alpacav2": "AlpacaEval V2",
+    prettified_names = {
+        "holmes": "Holmes",
+        "helm_lite_narrativeqa": "Helm Lite NarrativeQA",
+        "helm_lite_naturalquestionsopen": "Helm Lite NaturalQuestionsOpen",
+        "helm_lite_naturalquestionsclosed": "Helm Lite NaturalQuestionsClosed",
+        "helm_lite_openbookqa": "Helm Lite OpenBookQA",
+        "helm_lite_mmlu": "Helm Lite MMLU",
+        "helm_lite_math_equivalentcot": "Helm Lite MathEquivalentCOT",
+        "helm_lite_gsm8k": "Helm Lite GSM8K",
+        "helm_lite_legalbench": "Helm Lite LegalBench",
+        "helm_lite_medqa": "Helm Lite MedQA",
+        "helm_lite_wmt2014": "Helm Lite WMT2014",
+        "hfv2_bbh": "HFv2 BBH",
+        "hfv2_bbh_raw": "HFv2 BBH Raw",
+        "hfv2_gpqa": "HFv2 GPQA",
+        "hfv2_ifeval": "HFv2 IFEval",
+        "hfv2_math_lvl_5": "HFv2 Math Level 5",
+        "hfv2_mmlu_pro": "HFv2 MMLU Pro",
+        "hfv2_musr": "HFv2 MuSR",
+        "oc_mmlu": "OpenCompass MMLU",
+        "oc_mmlu_pro": "OpenCompass MMLU Pro",
+        "oc_cmmlu": "OpenCompass CMMLU",
+        "oc_bbh": "OpenCompass BBH",
+        "oc_gqpa_dimand": "OpenCompass GQPA-Dimand",
+        "oc_humaneval": "OpenCompass HumanEval",
+        "oc_ifeval": "OpenCompass IFEval",
+        "helm_mmlu": "Helm MMLU",
+        "helm_boolq": "Helm BoolQ",
+        "helm_narrativeqa": "Helm NarrativeQA",
+        "helm_naturalquestionsclosed": "Helm NaturalQuestionsClosed",
+        "helm_naturalquestionsopen": "Helm NaturalQuestionsOpen",
+        "helm_quac": "Helm QuAC",
+        "helm_openbookqa": "Helm OpenBookQA",
+        "helm_imdb": "Helm IMDB",
+        "helm_civilcomments": "Helm CivilComments",
+        "helm_raft": "Helm RAFT",
+        "mmlu_pro": "MMLU Pro",
+        "mixeval_triviaqa": "MixEval TriviaQA",
+        "mixeval_mmlu": "MixEval MMLU",
+        "mixeval_drop": "MixEval DROP",
+        "mixeval_hellaswag": "MixEval HellaSwag",
+        "mixeval_commonsenseqa": "MixEval CommonsenseQA",
+        "mixeval_triviaqa_hard": "MixEval TriviaQA Hard",
+        "mixeval_mmlu_hard": "MixEval MMLU Hard",
+        "mixeval_drop_hard": "MixEval DROP Hard",
+        "oc_language": "OpenCompass Language",
+        "oc_knowledge": "OpenCompass Knowledge",
+        "oc_reasoning": "OpenCompass Reasoning",
+        "oc_math": "OpenCompass Math",
+        "oc_code": "OpenCompass Code",
+        "oc_instruct": "OpenCompass Instruction",
+        "oc_agent": "OpenCompass Agent",
+        "oc_arena": "OpenCompass Arena",
+        "lb_reasoning": "LiveBench Reasoning",
+        "lb_coding": "LiveBench Coding",
+        "lb_mathematics": "LiveBench Mathematics",
+        "lb_data_analysis": "LiveBench Data Analysis",
+        "lb_language": "LiveBench Language",
+        "lb_if": "LiveBench Instruction Following",
+        "wb_info_seek": "WildBench Information Seeking",
+        "wb_creative": "WildBench Creative",
+        "wb_code_debug": "WildBench Code Debugging",
+        "wb_math_data": "WildBench Math & Data",
+        "wb_reason_plan": "WildBench Reasoning & Planning",
+        "wb_score": "WildBench Score",
+        "hfv1_arc": "HFv1 ARC",
+        "hfv1_gsm8k": "HFv1 GSM8K",
+        "hfv1_hellaswag": "HFv1 HellaSwag",
+        "hfv1_mmlu": "HFv1 MMLU",
+        "hfv1_truthfulqa": "HFv1 TruthfulQA",
+        "hfv1_winogrande": "HFv1 Winogrande",
+        "biggen_grounding": "BigBench Grounding",
+        "biggen_instruction_following": "BigBench Instruction Following",
+        "biggen_planning": "BigBench Planning",
+        "biggen_reasoning": "BigBench Reasoning",
+        "biggen_refinement": "BigBench Refinement",
+        "biggen_safety": "BigBench Safety",
+        "biggen_theory_of_mind": "BigBench Theory of Mind",
+        "biggen_tool_usage": "BigBench Tool Usage",
+        "biggen_multilingual": "BigBench Multilingual",
+        "lb_reasoning_average": "LiveBench Reasoning Average",
+        "lb_coding_average": "LiveBench Coding Average",
+        "lb_mathematics_average": "LiveBench Mathematics Average",
+        "lb_data_analysis_average": "LiveBench Data Analysis Average",
+        "lb_language_average": "LiveBench Language Average",
+        "lb_if_average": "LiveBench Instruction Following Average",
+        "helm_lite": "Helm Lite",
+        "hf_open_llm_v2": "HF OpenLLM v2",
+        "opencompass_academic": "OpenCompass Academic",
+        "arena_elo": "Arena Elo",
+        "helm_classic": "Helm Classic",
+        "mixeval": "MixEval",
+        "mixeval_hard": "MixEval Hard",
+        "opencompass": "OpenCompass",
+        "alphacaeval_v2lc": "AlphacaEval v2lc",
+        "livebench_240725": "LiveBench 240725",
+        "wb_elo_lc": "WildBench Elo LC",
         "arena_hard": "Arena Hard",
-        "arc_c": "ARC-C",
-        "eq_benchv2": "EQ Bench V2",
-        "agieval": "AGIEval",
-        "llmonitor": "LLMonitor",
-        "bbh": "BBH",
-        "mmlu": "MMLU",
-        "alpacav1": "AlpacaEval V1",
-        "magi": "MAGI",
-        "alpacaeval2_lc": "AlpacaEval V2 Length Adjusted",
-        "gpt4all": "GPT-4-All",
-        "humaneval": "HumanEval",
-        "mbpp": "MBPP",
-        "hellaswag": "HellaSwag",
-        "hugging_6": "HF OpenLLM V1",
-        "winogrande": "Winogrande",
+        "agentbench": "AgentBench",
+        "hf_open_llm_v1": "HF OpenLLM v1",
+        "biggen": "BigBench",
+        "livebench_240624": "LiveBench 240624",
+        "mt_bench": "MT-Bench",
     }
 
-    if bench_name in benchmarks_dict:
-        return benchmarks_dict[bench_name]
+    if bench_name in prettified_names:
+        return prettified_names[bench_name]
     else:
         return bench_name
 
 
+holistic_scenarios = [
+    get_nice_benchmark_name(scen)
+    for scen in [
+        # "holmes",
+        "helm_lite",
+        # "narrativeqa",
+        # "naturalquestionsopen",
+        # "naturalquestionsclosed",
+        # "openbookqa",
+        # "mmlu",
+        # "math_equivalentcot",
+        # "gsm8k",
+        # "legalbench",
+        # "medqa",
+        # "wmt2014",
+        # "arc_c",
+        # "arc_e",
+        # "boolq",
+        # "csqa",
+        # "hellaswag",
+        # "piqa",
+        # "siqa",
+        # "winogrande",
+        # "olmes_average",
+        # "bbh",
+        # "bbh_raw",
+        # "gpqa",
+        "hf_open_llm_v2",
+        # "ifeval",
+        # "math_lvl_5",
+        # "mmlu_pro",
+        # "musr",
+        "opencompass_academic",
+        # "oc_mmlu",
+        # "oc_mmlu_pro",
+        # "oc_cmmlu",
+        # "oc_bbh",
+        # "oc_gqpa_dimand",
+        # "oc_math",
+        # "oc_humaneval",
+        # "oc_ifeval",
+        # "helm_mmlu",
+        "arena_elo",
+        "helm_classic",
+        # "quac",
+        # "truthfulqa",
+        # "ms_marcoregular",
+        # "ms_marcotrec",
+        # "cnn/dailymail",
+        # "xsum",
+        # "imdb",
+        # "civilcomments",
+        # "raft",
+        "mixeval_hard",
+        "mixeval",
+        # "arena_elo0527",
+        "opencompass",
+        # "oc_language",
+        # "oc_knowledge",
+        # "oc_reasoning",
+        # "oc_code",
+        # "oc_instruct",
+        # "oc_agent",
+        # "oc_arena",
+        "alphacaeval_v2lc",
+        "livebench_240725",
+        "livebench_240624",
+        # "lb_reasoning",
+        # "lb_coding",
+        # "lb_mathematics",
+        # "lb_data_analysis",
+        # "lb_language",
+        # "lb_if",
+        "wb_elo_lc",
+        # "wb_info_seek",
+        # "wb_creative",
+        # "wb_code_debug",
+        # "wb_math_data",
+        # "wb_reason_plan",
+        # "wb_score",
+        # "boolqmixed",
+        "arena_hard",
+        "agentbench",
+        # "arc",
+        "hf_open_llm_v1",
+        "biggen",
+        # "biggen_grounding",
+        # "biggen_instruction_following",
+        # "biggen_planning",
+        # "biggen_reasoning",
+        # "biggen_refinement",
+        # "biggen_safety",
+        # "biggen_theory_of_mind",
+        # "biggen_tool_usage",
+        # "biggen_multilingual",
+        # "lb_global_average",
+        # "lb_reasoning_average",
+        # "lb_coding_average",
+        # "lb_mathematics_average",
+        # "lb_data_analysis_average",
+        # "lb_language_average",
+        # "lb_if_average",
+        # "mt_bench",
+    ]
+]
+
+
 st.markdown(
     """<h1 style='text-align: center; color: black;'>🏋️‍♂️ BenchBench Leaderboard 🏋️‍♂️</h1>""",
     unsafe_allow_html=True,
@@ -70,7 +237,11 @@ st.markdown(
 )
 
 
-all_scenarios_for_aggragate = holistic_scenarios
+all_scenarios_for_aggragate = Benchmark()
+all_scenarios_for_aggragate.load_local_catalog()
+all_scenarios_for_aggragate = (
+    all_scenarios_for_aggragate.df["scenario"].unique().tolist()
+)
 
 st.subheader("The Leaderboard", divider=True)
 # st.subheader("🏋️‍♂️ BenchBench Leaderboard 🏋", divider=True)
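The hunk above replaces the hard-coded scenario list with whatever the local bat catalog contains. Below is a minimal sketch of that pattern, using only the calls visible in this diff (Benchmark, load_local_catalog, the df["scenario"] column); the variable names and the print line are illustrative, and it assumes the bat package and its bundled catalog are installed:

```python
from bat import Benchmark

# Load the catalog that ships with bat; its DataFrame is assumed to carry at
# least "scenario", "model" and "source" columns, as used elsewhere in this diff.
catalog = Benchmark()
catalog.load_local_catalog()

# Every catalog scenario becomes a candidate for the aggregate.
all_scenarios = catalog.df["scenario"].unique().tolist()
print(f"{len(all_scenarios)} scenarios available for aggregation")
```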
@@ -79,29 +250,26 @@ leftcol, rightcol = st.columns([2, 1])
 
 with st.expander("Leaderboard configurations (defaults are great BTW)", icon="⚙️"):
     with st.form("my_form"):
-        all_scenarios_for_aggragate_with_all = all_scenarios_for_aggragate
-        all_scenarios_for_aggragate_with_all.append("All Holistic")
+        all_scenarios_for_aggragate_with_all = [
+            get_nice_benchmark_name(scenario)
+            for scenario in all_scenarios_for_aggragate
+        ]
 
         aggragate_scenarios = st.multiselect(
-            "Scenarios in Aggregate",
-            all_scenarios_for_aggragate_with_all,
-            ["All Holistic"],
-            # all_scenarios_for_aggragate,
+            "Scenarios in Aggregate (defaults are the 'Holistic' benchmarks)",
+            all_scenarios_for_aggragate,
+            holistic_scenarios,
         )
 
         corr_type = st.selectbox(
            label="Select Correlation type", options=["kendall", "pearson"], index=0
         )
 
-        aggragate_scenario_blacklist = (
-            [
-                scen
-                for scen in all_scenarios_for_aggragate
-                if scen not in aggragate_scenarios
-            ]
-            if "All Holistic" not in aggragate_scenarios
-            else []
-        )
+        aggragate_scenario_blacklist = [
+            scen
+            for scen in all_scenarios_for_aggragate
+            if scen not in aggragate_scenarios
+        ]
 
         model_select_strategy = st.selectbox(
             label="Select strategy",
@@ -109,7 +277,15 @@ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="⚙️"):
             index=0,
         )
 
-        n_models_taken_list = [5]
+        n_models_taken_list = st.slider(
+            label="Select number of models to use",
+            min_value=3,
+            max_value=20,
+            value=10,
+        )
+
+        n_models_taken_list = [n_models_taken_list]
+
         n_exps = 10
 
         submitted = st.form_submit_button(label="Run BAT")
@@ -197,27 +373,18 @@ def run_load(
     # allbench.df = allbench.df[~allbench.df["source"].str.contains("livebench")]
 
     allbench.extend(my_benchmark)
-    allbench.df = allbench.df.drop(columns=["tag"])
+    # allbench.df = allbench.df.drop(columns=["tag"])
     allbench.clear_repeated_scenarios()
-    allbench.df = allbench.df.query("scenario not in @holistic_scenarios")
-
-    # allbench.df = allbench.df[~allbench.df["scenario"].str.contains("_mixed")]
-    # allbench.df = allbench.df[~allbench.df["scenario"].str.contains("agentbench")]
-
-    # st.dataframe(holistic.df.query('scenario=="aggregate"'))
 
+    # removing and re-adding the holistic scenarios
+    allbench.df = allbench.df.query("scenario not in @holistic_scenarios")
     allbench = allbench.extend(holistic)
 
     tester = Tester(cfg=cfg)
 
-    # len(allbench.get_scenario_appearences_count().keys())
-
-    allbench.df.query('source=="BlueBench"').model.unique()
-
-    allbench.df.query('scenario=="aggregate"').model.unique()
-
     agreements = tester.all_vs_all_agreement_testing(
-        allbench, single_source_scenario="aggregate"
+        allbench,
+        single_source_scenario="aggregate",  # only measuring all vs. the aggregate
     )
 
     agreements.to_csv(cache_path, index=False)
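For orientation, here is a sketch that strings together the bat calls appearing in this function (Benchmark, Tester, Reporter). It is not the full run_load logic: the Config arguments are not shown in this commit, so the cfg construction below is a placeholder assumption, and the uploaded and holistic benchmarks are omitted:

```python
from bat import Benchmark, Config, Reporter, Tester

cfg = Config()  # placeholder assumption: the real app builds cfg from the form inputs above

# Build the pool of benchmarks to compare.
allbench = Benchmark()
allbench.load_local_catalog()
allbench.clear_repeated_scenarios()

# Agreement of every benchmark with the single "aggregate" source scenario.
tester = Tester(cfg=cfg)
agreements = tester.all_vs_all_agreement_testing(
    allbench, single_source_scenario="aggregate"
)

# Summarize agreements as z-scores relative to the aggregate
# (kwarg spelled "aggragate_name" as in app.py).
reporter = Reporter()
z_scores = reporter.get_all_z_scores(agreements=agreements, aggragate_name="aggregate")
```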
@@ -236,12 +403,20 @@ agreements = run_load(
 
 reporter = Reporter()
 z_scores = reporter.get_all_z_scores(agreements=agreements, aggragate_name="aggregate")
+z_scores.drop(columns=["n_models_of_corr_with_agg"], inplace=True)
 
 corr_name = f"{'Kendall Tau' if corr_type=='kendall' else 'Per.'} Corr."
 
 z_scores["z_score"] = z_scores["z_score"].round(2)
 z_scores["corr_with_agg"] = z_scores["corr_with_agg"].round(2)
 z_scores["p_value_of_corr_with_agg"] = z_scores["p_value_of_corr_with_agg"].round(2)
+# z_scores["n_models_of_corr_with_agg"] = z_scores["n_models_of_corr_with_agg"].round(1)
+
+z_scores["source"] = z_scores["source"].apply(lambda x: x.split(".csv")[0])
+
+# print(z_scores["scenario"].unique().tolist())
+
+z_scores["scenario"] = z_scores["scenario"].apply(lambda x: get_nice_benchmark_name(x))
 
 data = (
     z_scores.rename(
@@ -249,7 +424,8 @@ data = (
             "scenario": "Benchmark",
             "z_score": "Z Score",
            "corr_with_agg": corr_name,
-            "p_value_of_corr_with_agg": "p value of Corr.",
+            "p_value_of_corr_with_agg": "p-value of Corr.",
+            # "n_models_of_corr_with_agg": "# Models Used",
             "source": "Source",
         }
     )
@@ -258,12 +434,6 @@ data = (
 )
 
 
-data = data[~data["Source"].str.contains("livebench")]
-data = data[~data["Source"].str.contains("biggen")]
-# data.drop(columns=["Source"], inplace=True)
-data["Benchmark"] = data["Benchmark"].apply(lambda x: get_nice_benchmark_name(x))
-
-
 # Apply coloring based on 'Z' values
 def highlight_uploaded_benchmark(row):
     if row["Source"] == "Uploaded Benchmark":
@@ -279,16 +449,23 @@ styled_data = (
         vmin=-data["Z Score"].abs().max(),
         vmax=data["Z Score"].abs().max(),
     )
-    .format(subset=["Z Score", corr_name, "p value of Corr."], formatter="{:.2}")
     .apply(highlight_uploaded_benchmark, axis=1)
+    .background_gradient(
+        subset=["p-value of Corr."],
+        cmap="Reds",
+        vmin=0.1,
+        vmax=1,
+    )
+    .format(subset=["Z Score", corr_name, "p-value of Corr."], formatter="{:.2}")
 )
 
+print(data["Benchmark"].unique().tolist())
 
 st.dataframe(
     data=styled_data,
     hide_index=True,
     use_container_width=True,
-    height=300,
+    height=500,
 )
 
 st.markdown(
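The styling change adds a second background gradient just for the p-value column and moves .format() to the end of the chain. Here is a self-contained sketch of the same pandas Styler pattern on made-up data; the colormap for the Z-Score gradient and the row-highlight CSS are illustrative, since this hunk does not show them (background_gradient also needs matplotlib installed):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "Benchmark": ["Arena Elo", "MixEval Hard"],
        "Z Score": [1.25, -0.40],
        "p-value of Corr.": [0.02, 0.31],
        "Source": ["holistic", "Uploaded Benchmark"],
    }
)


def highlight_uploaded_benchmark(row):
    # Shade the whole row when it comes from the user-uploaded benchmark.
    css = "background-color: rgba(100, 100, 100, 0.15)"
    return [css if row["Source"] == "Uploaded Benchmark" else "" for _ in row]


styled = (
    df.style.background_gradient(
        subset=["Z Score"],
        cmap="RdYlGn",  # illustrative; the app's colormap is not shown in this hunk
        vmin=-df["Z Score"].abs().max(),
        vmax=df["Z Score"].abs().max(),
    )
    .apply(highlight_uploaded_benchmark, axis=1)
    .background_gradient(subset=["p-value of Corr."], cmap="Reds", vmin=0.1, vmax=1)
    .format(subset=["Z Score", "p-value of Corr."], formatter="{:.2}")
)
styled.to_html()  # or pass `styled` to st.dataframe, as the app does
```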
@@ -309,7 +486,9 @@ st.write(r"""
 
 benchmarks = data["Benchmark"].unique().tolist()
 plotted_scenario = st.selectbox(
-    "Choose Benchmark to plot", benchmarks, index=benchmarks.index("LMSys Arena")
+    "Choose Benchmark to plot",
+    benchmarks,
+    index=benchmarks.index("Arena Elo"),
 )
 