lewtun (HF Staff) committed
Commit 4563c79 · Parent: f5a9775

Upgrade Gradio table

Files changed (3):
  1. README.md +2 -2
  2. app.py +27 -6
  3. debug.ipynb → dev.ipynb +0 -0
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
-title: R1-distilled leaderboard
+title: Large Reasoning Models Leaderboard
 emoji: ⚡
 colorFrom: gray
 colorTo: red
 sdk: gradio
-sdk_version: 4.21.0
+sdk_version: 5.24.0
 app_file: app.py
 pinned: true
 ---
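The `sdk_version` bump from 4.21.0 to 5.24.0 moves the Space from Gradio 4 to Gradio 5, which is presumably what drives the `gr.Dataframe` changes in app.py below: `height` gives way to `max_height`, and `show_row_numbers` / `show_copy_button` become available.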
app.py CHANGED
@@ -4,10 +4,10 @@ from pathlib import Path
 import gradio as gr
 import pandas as pd

-TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for open-r1 Models</h1>"""
+TITLE = """<h1 align="center" id="space-title">Large Reasoning Models Leaderboard</h1>"""

 DESCRIPTION = f"""
-Evaluation of open-r1 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
+Evaluation of Open R1 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
 """

 BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2"]
@@ -80,6 +80,14 @@ def get_leaderboard_df():
         elif task.lower() == "agieval":
             value = data["results"]["all"]["acc_norm"]
             df.loc[model_revision, task] = float(value)
+        # AIME24 and 25 report pass@1
+        elif task.lower() in ["aime24", "aime25"]:
+            value = (
+                data["results"]["all"]["math_pass@1:32_samples"]
+                if "math_pass@1:32_samples" in data["results"]["all"]
+                else -1
+            )
+            df.loc[model_revision, task] = float(value)
         # MATH reports qem
         elif task.lower() in ["aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
             value = data["results"]["all"]["qem"]
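The new branch reads the 32-sample pass@1 metric and falls back to -1 for result files that predate it. A minimal standalone sketch of the same lookup, assuming a lighteval-style results JSON; the file path is illustrative, not the Space's actual layout:

```python
import json
from pathlib import Path

# Illustrative path; real result files live wherever the Space syncs them from.
result_file = Path("results/aime24/results.json")
data = json.loads(result_file.read_text())

metrics = data["results"]["all"]
# AIME24/25 report pass@1 averaged over 32 samples; older runs may lack the
# key, so fall back to the same -1 sentinel the app uses.
value = float(metrics.get("math_pass@1:32_samples", -1))
print(value)
```

`dict.get` with a default is equivalent to the conditional expression in the diff, just more compact.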
@@ -135,7 +143,10 @@ def get_leaderboard_df():
     # Trim AIMO column names
     df.columns = [c.replace("aimo_", "") for c in df.columns]

-    df = df.reset_index().rename(columns={"index": "Model"}).round(4)
+    df = df.reset_index().rename(columns={"index": "Model"})
+    # Apply rounding only to numeric columns
+    numeric_cols = df.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns
+    df[numeric_cols] = df[numeric_cols].round(4)
     # Strip off date from model name
     df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])

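Taking `.round(4)` off the method chain and applying it only to numeric columns keeps string columns such as `Model` out of the rounding step. The same pattern in isolation, on toy data:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "Model": ["org/model-a", "org/model-b"],  # toy rows, not real scores
        "aime24": [0.43333333, 0.56666667],
        "aime25": [0.30000001, 0.39999999],
    }
)

# Round only the numeric columns, leaving the string column untouched.
numeric_cols = df.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns
df[numeric_cols] = df[numeric_cols].round(4)
print(df)
```

`select_dtypes(include=["number"])`, which the file already uses for the percentage conversion, would be an equivalent, slightly broader selector.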
@@ -154,6 +165,9 @@ def agg_df(df, agg: str = "max"):

     # Convert all values to percentage
     df[df.select_dtypes(include=["number"]).columns] *= 100.0
+    # Apply rounding only to numeric columns
+    numeric_cols = df.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns
+    df[numeric_cols] = df[numeric_cols].round(4)
     df = df.sort_values(by=["Average"], ascending=False)
     return df

@@ -177,6 +191,9 @@ def filter_and_search(cols: list[str], search_query: str, agg: str):
     df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
     # Recompute average
     df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
+    # Apply rounding only to numeric columns
+    numeric_cols = df.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns
+    df[numeric_cols] = df[numeric_cols].round(4)
     return df

@@ -187,7 +204,9 @@ with demo:
     with gr.Column():
         gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
         with gr.Row():
-            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
+            search_bar = gr.Textbox(
+                placeholder="Search for your model. Use semicolons for multiple terms", show_label=False
+            )
             agg = gr.Radio(
                 ["min", "max", "mean"],
                 value="max",
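The placeholder now advertises semicolon-separated search terms. The matching itself happens inside `filter_and_search`, which this hunk doesn't show, so the following is only a plausible sketch of such a filter; the `search_models` helper and its any-term-matches semantics are assumptions, not the Space's actual code:

```python
import re

import pandas as pd

def search_models(df: pd.DataFrame, search_query: str) -> pd.DataFrame:
    # Assumed semantics: keep rows whose Model name contains any of the
    # semicolon-separated terms, case-insensitively.
    terms = [t.strip() for t in search_query.split(";") if t.strip()]
    if not terms:
        return df
    pattern = "|".join(re.escape(t) for t in terms)
    return df[df["Model"].str.contains(pattern, case=False, regex=True)]

# e.g. search_models(leaderboard_df, "qwen; llama") keeps both model families.
```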
@@ -196,7 +215,7 @@ with demo:
             )
         with gr.Row():
             cols_bar = gr.CheckboxGroup(
-                choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
+                choices=sorted([c for c in leaderboard_df.columns[1:] if c not in ["Average", "Date"]]),
                 show_label=False,
                 info="Select columns to display",
             )
@@ -204,8 +223,10 @@ with demo:
         leaderboard_table = gr.Dataframe(
             value=leaderboard_df,
             wrap=True,
-            height=1000,
+            max_height=1000,
             column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
+            show_row_numbers=True,
+            show_copy_button=True,
         )

     cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
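All three `gr.Dataframe` arguments touched here exist in the Gradio 5.x API that the README now pins: `max_height` replaces the fixed `height`, and `show_row_numbers` / `show_copy_button` toggle the row-index column and the copy-to-clipboard button. A minimal self-contained sketch with toy data, not the leaderboard's real frame:

```python
import gradio as gr
import pandas as pd

df = pd.DataFrame({"Model": ["org/model-a", "org/model-b"], "Average": [51.2, 48.7]})

with gr.Blocks() as demo:
    gr.Dataframe(
        value=df,
        wrap=True,
        max_height=1000,        # table scrolls once it exceeds this height
        show_row_numbers=True,  # rank-style row index column
        show_copy_button=True,  # copy the table contents to the clipboard
    )

if __name__ == "__main__":
    demo.launch()
```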
debug.ipynb → dev.ipynb RENAMED
File without changes