Commit 765f7ba by Yotam-Perlitz
Parent: a50e6f5

revise text

Signed-off-by: Yotam-Perlitz <[email protected]>
app.py CHANGED
@@ -26,11 +26,12 @@ st.markdown(
 )
 
 st.markdown(
-    "
-
+    """
+    This leaderboard, featured in our work -- [Benchmark Agreement Testing Done Right: A Guide for LLM Benchmark Evaluation](https://arxiv.org/abs/2407.13696),
+    serves as a meta-benchmark. It ranks individual benchmarks based on their agreement with an aggregated reference benchmark, which harnesses insights from numerous diverse benchmarks.
+    """
 )
 
-
 all_scenarios_for_aggragate = Benchmark()
 all_scenarios_for_aggragate.load_local_catalog()
 all_scenarios_for_aggragate = (
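For context on the new description: "agreement" between a benchmark and the aggregate is typically measured as the rank correlation of the model scores they share. The sketch below is illustrative only, not the app's actual implementation; the model names and scores are invented, and Kendall's tau is just one common choice of agreement metric.

# Illustrative sketch: agreement between one benchmark and an aggregate,
# measured as Kendall's tau over the models both of them score.
# Model names and scores are invented for this example.
from scipy.stats import kendalltau

benchmark_scores = {"model-a": 0.61, "model-b": 0.74, "model-c": 0.55}
aggregate_scores = {"model-a": 0.58, "model-b": 0.71, "model-c": 0.60}

shared = sorted(set(benchmark_scores) & set(aggregate_scores))
tau, _p_value = kendalltau(
    [benchmark_scores[m] for m in shared],
    [aggregate_scores[m] for m in shared],
)
print(f"Agreement (Kendall tau) over {len(shared)} shared models: {tau:.2f}")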
@@ -128,8 +129,14 @@ with st.expander("Add your benchmarks here!", icon="🔥"):
     overlap_models = set(aggregate_models).intersection(uploaded_models)
     if len(overlap_models) < n_models_taken_list[0]:
         st.warning(
-            f"You have just {len(overlap_models)} models intersecting with the aggregate
-
+            f"You have just {len(overlap_models)} models intersecting with the aggregate!\n"
+        )
+
+        st.info(
+            f"Here are some models you could run your benchmark over:{[m for m in aggregate_models if m not in uploaded_models]}"
+        )
+        st.info(
+            f"Model that you have and the aggragate does not: {[m for m in uploaded_models if m not in aggregate_models]}"
         )
 
 
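The reworked block above splits the old single warning into a warning plus two st.info messages: one listing aggregate models missing from the upload, one listing uploaded models the aggregate lacks. A minimal standalone sketch of that set logic, with invented model names (n_models_taken stands in for n_models_taken_list[0] from the app's controls):

# Stand-alone sketch of the overlap check above; the model names are invented.
aggregate_models = ["llama-3-8b", "mistral-7b", "gemma-7b", "phi-3-mini"]
uploaded_models = ["llama-3-8b", "gpt-4o-mini"]
n_models_taken = 3  # stands in for n_models_taken_list[0]

overlap_models = set(aggregate_models).intersection(uploaded_models)
if len(overlap_models) < n_models_taken:
    print(f"You have just {len(overlap_models)} models intersecting with the aggregate!")
    print("Models you could run your benchmark over:",
          [m for m in aggregate_models if m not in uploaded_models])
    print("Models you have that the aggregate does not:",
          [m for m in uploaded_models if m not in aggregate_models])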
@@ -191,7 +198,7 @@ def run_load(
         scenario_whitelist=aggregate_scenario_whitelist,
         min_scenario_for_models_to_appear_in_agg=1
         if len(aggregate_scenario_whitelist) == 1
-        else len(aggregate_scenario_whitelist) //
+        else len(aggregate_scenario_whitelist) // 3,
     )
 
     allbench.extend(my_benchmark)
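The final hunk sets the divisor in the conditional expression to 3: a model must be covered by at least one scenario when the whitelist holds a single scenario, and by at least a third of the whitelisted scenarios otherwise. A small sketch of that rule (the helper name is invented for illustration):

# Invented helper name; mirrors the conditional expression in the hunk above.
def min_scenarios_for_model_to_appear(aggregate_scenario_whitelist: list) -> int:
    return (
        1
        if len(aggregate_scenario_whitelist) == 1
        else len(aggregate_scenario_whitelist) // 3
    )

print(min_scenarios_for_model_to_appear(["arena-hard"]))                       # 1
print(min_scenarios_for_model_to_appear([f"scenario-{i}" for i in range(9)]))  # 3

Note that for a two-scenario whitelist the floor division yields 0, which presumably imposes no minimum in that case.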