Commit: Upgrade Gradio table

Files changed:
- README.md (+2, −2)
- app.py (+27, −6)
- debug.ipynb → dev.ipynb (renamed, no content changes)
README.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
emoji: ⚡
|
4 |
colorFrom: gray
|
5 |
colorTo: red
|
6 |
sdk: gradio
|
7 |
-
sdk_version:
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
---
|
|
|
1 |
---
|
2 |
+
title: Large Reasoning Models Leaderboard
|
3 |
emoji: ⚡
|
4 |
colorFrom: gray
|
5 |
colorTo: red
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 5.24.0
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
---
|
app.py
CHANGED
@@ -4,10 +4,10 @@ from pathlib import Path
|
|
4 |
import gradio as gr
|
5 |
import pandas as pd
|
6 |
|
7 |
-
TITLE = """<h1 align="center" id="space-title">
|
8 |
|
9 |
DESCRIPTION = f"""
|
10 |
-
Evaluation of
|
11 |
"""
|
12 |
|
13 |
BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2"]
|
@@ -80,6 +80,14 @@ def get_leaderboard_df():
|
|
80 |
elif task.lower() == "agieval":
|
81 |
value = data["results"]["all"]["acc_norm"]
|
82 |
df.loc[model_revision, task] = float(value)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
# MATH reports qem
|
84 |
elif task.lower() in ["aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
|
85 |
value = data["results"]["all"]["qem"]
|
@@ -135,7 +143,10 @@ def get_leaderboard_df():
|
|
135 |
# Trim AIMO column names
|
136 |
df.columns = [c.replace("aimo_", "") for c in df.columns]
|
137 |
|
138 |
-
df = df.reset_index().rename(columns={"index": "Model"})
|
|
|
|
|
|
|
139 |
# Strip off date from model name
|
140 |
df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])
|
141 |
|
@@ -154,6 +165,9 @@ def agg_df(df, agg: str = "max"):
|
|
154 |
|
155 |
# Convert all values to percentage
|
156 |
df[df.select_dtypes(include=["number"]).columns] *= 100.0
|
|
|
|
|
|
|
157 |
df = df.sort_values(by=["Average"], ascending=False)
|
158 |
return df
|
159 |
|
@@ -177,6 +191,9 @@ def filter_and_search(cols: list[str], search_query: str, agg: str):
|
|
177 |
df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
|
178 |
# Recompute average
|
179 |
df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
|
|
|
|
|
|
|
180 |
return df
|
181 |
|
182 |
|
@@ -187,7 +204,9 @@ with demo:
|
|
187 |
with gr.Column():
|
188 |
gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
|
189 |
with gr.Row():
|
190 |
-
search_bar = gr.Textbox(
|
|
|
|
|
191 |
agg = gr.Radio(
|
192 |
["min", "max", "mean"],
|
193 |
value="max",
|
@@ -196,7 +215,7 @@ with demo:
|
|
196 |
)
|
197 |
with gr.Row():
|
198 |
cols_bar = gr.CheckboxGroup(
|
199 |
-
choices=[c for c in leaderboard_df.columns[1:] if c
|
200 |
show_label=False,
|
201 |
info="Select columns to display",
|
202 |
)
|
@@ -204,8 +223,10 @@ with demo:
|
|
204 |
leaderboard_table = gr.Dataframe(
|
205 |
value=leaderboard_df,
|
206 |
wrap=True,
|
207 |
-
|
208 |
column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
|
|
|
|
|
209 |
)
|
210 |
|
211 |
cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
|
|
|
4 |
import gradio as gr
|
5 |
import pandas as pd
|
6 |
|
7 |
+
TITLE = """<h1 align="center" id="space-title">Large Reasoning Models Leaderboard</h1>"""
|
8 |
|
9 |
DESCRIPTION = f"""
|
10 |
+
Evaluation of Open R1 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
|
11 |
"""
|
12 |
|
13 |
BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2"]
|
|
|
80 |
elif task.lower() == "agieval":
|
81 |
value = data["results"]["all"]["acc_norm"]
|
82 |
df.loc[model_revision, task] = float(value)
|
83 |
+
# AIME24 and 25 report pass@1
|
84 |
+
elif task.lower() in ["aime24", "aime25"]:
|
85 |
+
value = (
|
86 |
+
data["results"]["all"]["math_pass@1:32_samples"]
|
87 |
+
if "math_pass@1:32_samples" in data["results"]["all"]
|
88 |
+
else -1
|
89 |
+
)
|
90 |
+
df.loc[model_revision, task] = float(value)
|
91 |
# MATH reports qem
|
92 |
elif task.lower() in ["aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
|
93 |
value = data["results"]["all"]["qem"]
|
|
|
143 |
# Trim AIMO column names
|
144 |
df.columns = [c.replace("aimo_", "") for c in df.columns]
|
145 |
|
146 |
+
df = df.reset_index().rename(columns={"index": "Model"})
|
147 |
+
# Apply rounding only to numeric columns
|
148 |
+
numeric_cols = df.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns
|
149 |
+
df[numeric_cols] = df[numeric_cols].round(4)
|
150 |
# Strip off date from model name
|
151 |
df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])
|
152 |
|
|
|
165 |
|
166 |
# Convert all values to percentage
|
167 |
df[df.select_dtypes(include=["number"]).columns] *= 100.0
|
168 |
+
# Apply rounding only to numeric columns
|
169 |
+
numeric_cols = df.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns
|
170 |
+
df[numeric_cols] = df[numeric_cols].round(4)
|
171 |
df = df.sort_values(by=["Average"], ascending=False)
|
172 |
return df
|
173 |
|
|
|
191 |
df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
|
192 |
# Recompute average
|
193 |
df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
|
194 |
+
# Apply rounding only to numeric columns
|
195 |
+
numeric_cols = df.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns
|
196 |
+
df[numeric_cols] = df[numeric_cols].round(4)
|
197 |
return df
|
198 |
|
199 |
|
|
|
204 |
with gr.Column():
|
205 |
gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
|
206 |
with gr.Row():
|
207 |
+
search_bar = gr.Textbox(
|
208 |
+
placeholder="Search for your model. Use semicolons for multiple terms", show_label=False
|
209 |
+
)
|
210 |
agg = gr.Radio(
|
211 |
["min", "max", "mean"],
|
212 |
value="max",
|
|
|
215 |
)
|
216 |
with gr.Row():
|
217 |
cols_bar = gr.CheckboxGroup(
|
218 |
+
choices=sorted([c for c in leaderboard_df.columns[1:] if c not in ["Average", "Date"]]),
|
219 |
show_label=False,
|
220 |
info="Select columns to display",
|
221 |
)
|
|
|
223 |
leaderboard_table = gr.Dataframe(
|
224 |
value=leaderboard_df,
|
225 |
wrap=True,
|
226 |
+
max_height=1000,
|
227 |
column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
|
228 |
+
show_row_numbers=True,
|
229 |
+
show_copy_button=True,
|
230 |
)
|
231 |
|
232 |
cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
|
debug.ipynb → dev.ipynb — renamed; file contents unchanged.