import json
from pathlib import Path

import gradio as gr
import pandas as pd

TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for open-r1 Models</h1>"""

DESCRIPTION = f"""
Evaluation of open-r1 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
"""

BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2"]


def get_leaderboard_df():
    filepaths = list(Path("eval_results").rglob("*.json"))
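
    # NOTE (assumption): result files are expected under a layout roughly like
    # eval_results/<org>/<model>/<revision>/<task>/..._<timestamp>.json, so that path_parts[1:4]
    # identifies the model + revision, path_parts[4] the task, and the filename stem ends with a
    # timestamp whose trailing characters are trimmed when the date is extracted below.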

    # Parse filepaths to get unique models
    models = set()
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        models.add(model_revision)

    # Initialize DataFrame
    df = pd.DataFrame(index=list(models))

    # Extract data from each file and populate the DataFrame
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        date = filepath.stem.split("_")[-1][:-3]
        model_revision = "_".join(path_parts[1:4]) + "_" + date
        task = path_parts[4]
        df.loc[model_revision, "Date"] = date

        with open(filepath, "r") as file:
            data = json.load(file)
            # Skip benchmarks that we don't want to include in the leaderboard
            if task.lower() in BENCHMARKS_TO_SKIP:
                continue
            # MixEval doesn't have a results key, so we need to get the overall score
            if task.lower() in ["mixeval", "mixeval_hard"]:
                value = data["overall score (final score)"]
                df.loc[model_revision, f"{task}"] = value
            else:
                first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
                # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
                if task.lower() == "truthfulqa":
                    value = data["results"][first_result_key]["truthfulqa_mc2"]
                    df.loc[model_revision, task] = float(value)
                # IFEval has several metrics, but we report the average of the four accuracies, as in the Llama 3 paper
                elif task.lower() == "ifeval":
                    values = 0.0
                    for metric in [
                        "prompt_level_loose",
                        "prompt_level_strict",
                        "inst_level_strict",
                        "inst_level_loose",
                    ]:
                        values += data["results"][first_result_key][f"{metric}_acc"]
                    value = values / 4
                    df.loc[model_revision, f"{task}"] = float(value)
                # MMLU has several metrics, but we report just the average one
                elif task.lower() == "mmlu":
                    value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
                    df.loc[model_revision, task] = float(value)
                # HellaSwag and ARC report acc_norm
                elif task.lower() in ["hellaswag", "arc"]:
                    value = data["results"][first_result_key]["acc_norm"]
                    df.loc[model_revision, task] = float(value)
                # BBH has several metrics, but we report just the average one
                elif task.lower() == "bbh":
                    if "all" in data["results"]:
                        value = data["results"]["all"]["acc"]
                    else:
                        value = -100
                    df.loc[model_revision, task] = float(value)
                # AGIEval reports acc_norm
                elif task.lower() == "agieval":
                    value = data["results"]["all"]["acc_norm"]
                    df.loc[model_revision, task] = float(value)
                # MATH reports qem
                elif task.lower() in ["aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
                    value = data["results"]["all"]["qem"]
                    df.loc[model_revision, task] = float(value)
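                # NOTE (assumption): for the two branches below, the result keys are assumed to look
                # like "<suite>|<task>:<variant>|<num_fewshot>", so splitting on "|" and ":" recovers
                # the level / prompt-version suffix used to build the column name.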
                # For mini_math we report 5 metrics, one per level, and store each one as a separate column in the dataframe
                elif task.lower() in ["mini_math_v2"]:
                    for k, v in data["results"].items():
                        if k != "all":
                            level = k.split("|")[1].split(":")[-1]
                            value = v["qem"]
                            df.loc[model_revision, f"{task}_{level}"] = value
                # For PoT we report N metrics, one per prompt, and store each one as a separate column in the dataframe
                elif task.lower() in ["aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot"]:
                    for k, v in data["results"].items():
                        if k != "all" and "_average" not in k:
                            version = k.split("|")[1].split(":")[-1]
                            value = v["qem"] if "qem" in v else v["score"]
                            df.loc[model_revision, f"{task}_{version}"] = value
                # For kaggle_tora the accuracy is reported as a percentage, so we need to divide by 100
                elif task.lower() in [
                    "aimo_tora_eval_kaggle_medium",
                    "aimo_tora_eval_kaggle_hard",
                    "aimo_kaggle_fast_eval_hard",
                    "aimo_kaggle_tora_medium",
                    "aimo_kaggle_tora_hard",
                    "aimo_kaggle_tora_medium_extended",
                    "aimo_kaggle_tora_hard_extended",
                    "aimo_math_integer_lvl4",
                    "aimo_math_integer_lvl5",
                ]:
                    for k, v in data["results"].items():
                        value = float(v["qem"]) / 100.0
                        df.loc[model_revision, f"{task}"] = value
                # For AlpacaEval we report the base win rate and the length-controlled one; both are
                # percentages, so we divide by 100 to match the other scores
                elif task.lower() == "alpaca_eval":
                    value = data["results"][first_result_key]["win_rate"]
                    df.loc[model_revision, "Alpaca_eval"] = value / 100.0
                    value = data["results"][first_result_key]["length_controlled_winrate"]
                    df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
                else:
                    first_metric_key = next(
                        iter(data["results"][first_result_key])
                    )  # gets the first key in the first result
                    value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
                    df.loc[model_revision, task] = float(value)

    # Drop rows where every entry is NaN
    df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])

    # Trim mini_math column names
    df.columns = [c.replace("_level_", "_l") for c in df.columns]

    # Trim AIMO column names
    df.columns = [c.replace("aimo_", "") for c in df.columns]

    df = df.reset_index().rename(columns={"index": "Model"}).round(4)

    # Strip off the date from the model name
    df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])

    return df


leaderboard_df = get_leaderboard_df()


def agg_df(df, agg: str = "max"):
    df = df.copy()
    # Drop date and aggregate results by model name
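    # Since the date suffix was stripped from "Model" in get_leaderboard_df, runs of the same model
    # from different dates collapse into a single row here, combined according to `agg` (min/max/mean)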
    df = df.drop("Date", axis=1).groupby("Model").agg(agg).reset_index()
    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))

    # Convert all values to percentages
    df[df.select_dtypes(include=["number"]).columns] *= 100.0
    df = df.sort_values(by=["Average"], ascending=False)
    return df


# Function to update the table based on the column selection, search query, and aggregation mode
def filter_and_search(cols: list[str], search_query: str, agg: str):
    df = leaderboard_df
    df = agg_df(df, agg)
    if len(search_query) > 0:
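        # Multiple search terms can be separated by ";"; they are OR-ed together as a regex alternation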
        search_terms = search_query.split(";")
        search_terms = [term.strip().lower() for term in search_terms]
        pattern = "|".join(search_terms)
        df = df[df["Model"].str.lower().str.contains(pattern, regex=True)]
        # Drop any columns which are all NaN
        df = df.dropna(how="all", axis=1)

    if len(cols) > 0:
        index_cols = list(leaderboard_df.columns[:1])
        new_cols = index_cols + cols
        df = df.copy()[new_cols]
        # Drop rows where all of the selected benchmark columns are NaN
        df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
        # Recompute the average over the selected columns
        df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))

    return df
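

# Example (hypothetical column/model names) of calling the filter outside the UI:
#   filter_and_search(cols=["mmlu", "ifeval"], search_query="qwen; llama", agg="mean")
# returns the aggregated leaderboard restricted to those columns and to models whose name contains
# "qwen" or "llama", with the "Average" column recomputed over the selected columns.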

demo = gr.Blocks()

with demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
            agg = gr.Radio(
                ["min", "max", "mean"],
                value="max",
                label="Aggregation",
                info="How to aggregate results for each model",
            )
        with gr.Row():
            cols_bar = gr.CheckboxGroup(
                choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
                show_label=False,
                info="Select columns to display",
            )
        with gr.Group():
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df,
                wrap=True,
                height=1000,
                column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
            )

    cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
    agg.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
    search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])

demo.launch()