Yotam-Perlitz committed on
Commit
f32be22
1 Parent(s): 363d8ae

remove HFv2 BBH Raw

Browse files

Signed-off-by: Yotam-Perlitz <[email protected]>

Files changed (1) hide show
  1. app.py +7 -4
app.py CHANGED
@@ -75,8 +75,8 @@ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="
75
  n_models_taken_list = st.slider(
76
  label="Select number of models to use",
77
  min_value=3,
78
- max_value=20,
79
- value=10,
80
  )
81
 
82
  n_models_taken_list = [n_models_taken_list]
@@ -140,7 +140,7 @@ def run_load(
140
  corr_types=["kendall"],
141
  n_exps=10,
142
  my_benchmark=Benchmark(),
143
- use_caching=False,
144
  ):
145
  # Create a hash of the inputs to generate a unique cache file for each set of inputs
146
  input_str = (
@@ -182,13 +182,16 @@ def run_load(
182
  allbench = Benchmark()
183
  allbench.load_local_catalog()
184
 
 
 
 
185
  allbench.add_aggregate(
186
  new_col_name="aggregate",
187
  agg_source_name="aggregate",
188
  scenario_whitelist=aggregate_scenario_whitelist,
189
  min_scenario_for_models_to_appear_in_agg=1
190
  if len(aggregate_scenario_whitelist) == 1
191
- else 2,
192
  )
193
 
194
  allbench.extend(my_benchmark)
 
75
  n_models_taken_list = st.slider(
76
  label="Select number of models to use",
77
  min_value=3,
78
+ max_value=15,
79
+ value=8,
80
  )
81
 
82
  n_models_taken_list = [n_models_taken_list]
 
140
  corr_types=["kendall"],
141
  n_exps=10,
142
  my_benchmark=Benchmark(),
143
+ use_caching=True,
144
  ):
145
  # Create a hash of the inputs to generate a unique cache file for each set of inputs
146
  input_str = (
 
182
  allbench = Benchmark()
183
  allbench.load_local_catalog()
184
 
185
+ scenarios_to_drop = ["HFv2 BBH Raw"]
186
+ allbench.df = allbench.df.query("scenario not in @scenarios_to_drop")
187
+
188
  allbench.add_aggregate(
189
  new_col_name="aggregate",
190
  agg_source_name="aggregate",
191
  scenario_whitelist=aggregate_scenario_whitelist,
192
  min_scenario_for_models_to_appear_in_agg=1
193
  if len(aggregate_scenario_whitelist) == 1
194
+ else len(aggregate_scenario_whitelist) // 2,
195
  )
196
 
197
  allbench.extend(my_benchmark)