lewtun (HF Staff) committed
Commit 4563c79 · Parent: f5a9775

Upgrade Gradio table

Files changed (3):
  1. README.md +2 -2
  2. app.py +27 -6
  3. debug.ipynb → dev.ipynb +0 -0
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
-title: R1-distilled leaderboard
+title: Large Reasoning Models Leaderboard
 emoji: ⚡
 colorFrom: gray
 colorTo: red
 sdk: gradio
-sdk_version: 4.21.0
+sdk_version: 5.24.0
 app_file: app.py
 pinned: true
 ---
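The `sdk_version` bump from 4.21.0 to 5.24.0 moves the Space from Gradio 4 to Gradio 5, which is presumably what drives the `gr.Dataframe` changes in app.py below: `height` gives way to `max_height`, and `show_row_numbers` / `show_copy_button` become available.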
app.py CHANGED
@@ -4,10 +4,10 @@ from pathlib import Path
 import gradio as gr
 import pandas as pd

-TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for open-r1 Models</h1>"""
+TITLE = """<h1 align="center" id="space-title">Large Reasoning Models Leaderboard</h1>"""

 DESCRIPTION = f"""
-Evaluation of open-r1 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
+Evaluation of Open R1 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
 """

 BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2"]
@@ -80,6 +80,14 @@ def get_leaderboard_df():
         elif task.lower() == "agieval":
             value = data["results"]["all"]["acc_norm"]
             df.loc[model_revision, task] = float(value)
+        # AIME24 and 25 report pass@1
+        elif task.lower() in ["aime24", "aime25"]:
+            value = (
+                data["results"]["all"]["math_pass@1:32_samples"]
+                if "math_pass@1:32_samples" in data["results"]["all"]
+                else -1
+            )
+            df.loc[model_revision, task] = float(value)
         # MATH reports qem
         elif task.lower() in ["aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
             value = data["results"]["all"]["qem"]
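The new branch reads the 32-sample pass@1 metric and falls back to -1 for result files that predate it. A minimal standalone sketch of the same lookup, assuming a lighteval-style results JSON; the file path is illustrative, not the Space's actual layout:

```python
import json
from pathlib import Path

# Illustrative path; real result files live wherever the Space syncs them from.
result_file = Path("results/aime24/results.json")
data = json.loads(result_file.read_text())

metrics = data["results"]["all"]
# AIME24/25 report pass@1 averaged over 32 samples; older runs may lack the
# key, so fall back to the same -1 sentinel the app uses.
value = float(metrics.get("math_pass@1:32_samples", -1))
print(value)
```

`dict.get` with a default is equivalent to the conditional expression in the diff, just more compact.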
@@ -135,7 +143,10 @@ def get_leaderboard_df():
     # Trim AIMO column names
     df.columns = [c.replace("aimo_", "") for c in df.columns]

-    df = df.reset_index().rename(columns={"index": "Model"}).round(4)
+    df = df.reset_index().rename(columns={"index": "Model"})
+    # Apply rounding only to numeric columns
+    numeric_cols = df.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns
+    df[numeric_cols] = df[numeric_cols].round(4)
     # Strip off date from model name
     df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])

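Taking `.round(4)` off the method chain and applying it only to numeric columns keeps string columns such as `Model` out of the rounding step. The same pattern in isolation, on toy data:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "Model": ["org/model-a", "org/model-b"],  # toy rows, not real scores
        "aime24": [0.43333333, 0.56666667],
        "aime25": [0.30000001, 0.39999999],
    }
)

# Round only the numeric columns, leaving the string column untouched.
numeric_cols = df.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns
df[numeric_cols] = df[numeric_cols].round(4)
print(df)
```

`select_dtypes(include=["number"])`, which the file already uses for the percentage conversion, would be an equivalent, slightly broader selector.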
@@ -154,6 +165,9 @@ def agg_df(df, agg: str = "max"):

     # Convert all values to percentage
     df[df.select_dtypes(include=["number"]).columns] *= 100.0
+    # Apply rounding only to numeric columns
+    numeric_cols = df.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns
+    df[numeric_cols] = df[numeric_cols].round(4)
     df = df.sort_values(by=["Average"], ascending=False)
     return df

@@ -177,6 +191,9 @@ def filter_and_search(cols: list[str], search_query: str, agg: str):
     df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
     # Recompute average
     df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
+    # Apply rounding only to numeric columns
+    numeric_cols = df.select_dtypes(include=["float64", "float32", "int64", "int32"]).columns
+    df[numeric_cols] = df[numeric_cols].round(4)
     return df

@@ -187,7 +204,9 @@ with demo:
     with gr.Column():
         gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
         with gr.Row():
-            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
+            search_bar = gr.Textbox(
+                placeholder="Search for your model. Use semicolons for multiple terms", show_label=False
+            )
             agg = gr.Radio(
                 ["min", "max", "mean"],
                 value="max",
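The placeholder now advertises semicolon-separated search terms. The matching itself happens inside `filter_and_search`, which this hunk doesn't show, so the following is only a plausible sketch of such a filter; the `search_models` helper and its any-term-matches semantics are assumptions, not the Space's actual code:

```python
import re

import pandas as pd

def search_models(df: pd.DataFrame, search_query: str) -> pd.DataFrame:
    # Assumed semantics: keep rows whose Model name contains any of the
    # semicolon-separated terms, case-insensitively.
    terms = [t.strip() for t in search_query.split(";") if t.strip()]
    if not terms:
        return df
    pattern = "|".join(re.escape(t) for t in terms)
    return df[df["Model"].str.contains(pattern, case=False, regex=True)]

# e.g. search_models(leaderboard_df, "qwen; llama") keeps both model families.
```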
@@ -196,7 +215,7 @@ with demo:
             )
         with gr.Row():
             cols_bar = gr.CheckboxGroup(
-                choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
+                choices=sorted([c for c in leaderboard_df.columns[1:] if c not in ["Average", "Date"]]),
                 show_label=False,
                 info="Select columns to display",
             )
@@ -204,8 +223,10 @@ with demo:
         leaderboard_table = gr.Dataframe(
             value=leaderboard_df,
             wrap=True,
-            height=1000,
+            max_height=1000,
             column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
+            show_row_numbers=True,
+            show_copy_button=True,
         )

     cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
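All three `gr.Dataframe` arguments touched here exist in the Gradio 5.x API that the README now pins: `max_height` replaces the fixed `height`, and `show_row_numbers` / `show_copy_button` toggle the row-index column and the copy-to-clipboard button. A minimal self-contained sketch with toy data, not the leaderboard's real frame:

```python
import gradio as gr
import pandas as pd

df = pd.DataFrame({"Model": ["org/model-a", "org/model-b"], "Average": [51.2, 48.7]})

with gr.Blocks() as demo:
    gr.Dataframe(
        value=df,
        wrap=True,
        max_height=1000,        # table scrolls once it exceeds this height
        show_row_numbers=True,  # rank-style row index column
        show_copy_button=True,  # copy the table contents to the clipboard
    )

if __name__ == "__main__":
    demo.launch()
```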
debug.ipynb → dev.ipynb RENAMED
File without changes