# Spaces: Sleeping
import io | |
import json | |
import gradio as gr | |
import pandas as pd | |
from huggingface_hub import HfFileSystem | |
# Path prefix of the results repository; it is interpolated into the glob
# pattern handed to the Hub filesystem client.
RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"

# Top-level keys of a result file that are dropped before flattening.
EXCLUDED_KEYS = {"pretty_env_info", "chat_template", "group_subtasks"}

# Values rejected at position 1 of a flattened "results.*" column path.
EXCLUDED_RESULTS_KEYS = {"leaderboard"}

# Values rejected at position 2 of a flattened "results.*" column path.
EXCLUDED_RESULTS_LEADERBOARDS_KEYS = {"alias"}

# Placeholder table shown before any model is loaded. Its header names
# ("Parameters", "Model-1", "Model-2") are read back when the table is parsed.
DEFAULT_HTML_TABLE = """
<table>
<thead>
<tr>
<th>Parameters</th>
<th>Model-1</th>
<th>Model-2</th>
</tr>
</thead>
<tbody>
</tbody>
</table>
"""

# Task key -> pair whose values are offered as choices of the "Tasks" radio.
# NOTE(review): the pair looks like (display label, submitted value) - confirm
# against the Gradio version in use.
TASKS = {
    "leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
    "leaderboard_bbh": ("BBH", "leaderboard_bbh"),
    "leaderboard_gpqa": ("GPQA", "leaderboard_gpqa"),
    "leaderboard_ifeval": ("IFEval", "leaderboard_ifeval"),
    "leaderboard_math_hard": ("MATH", "leaderboard_math"),
    "leaderboard_mmlu": ("MMLU", "leaderboard_mmlu"),
    "leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
    "leaderboard_musr": ("MuSR", "leaderboard_musr"),
}
# Single filesystem client shared by every read in this module.
fs = HfFileSystem()


def fetch_result_paths():
    """Return every ``*.json`` path matched under the results repository.

    The glob pattern is built from ``RESULTS_DATASET_ID`` and evaluated by
    the module-level ``fs`` client.
    """
    return fs.glob(f"{RESULTS_DATASET_ID}/**/**/*.json")
def filter_latest_result_path_per_model(paths):
    """Map each model id to the greatest of its result paths.

    The model id is the part of a path between the ``RESULTS_DATASET_ID``
    prefix (plus one separator) and the final ``/``. Paths are compared as
    plain strings, so "latest" means the lexicographically largest path.
    NOTE(review): presumably file names embed a sortable timestamp - confirm.

    Args:
        paths: Iterable of result file paths.

    Returns:
        Dict of model id -> selected path, in first-seen order of model ids.
    """
    prefix_length = len(RESULTS_DATASET_ID) + 1
    latest = {}
    for path in paths:
        model_id, _file_name = path[prefix_length:].rsplit("/", 1)
        current = latest.get(model_id)
        if current is None or path > current:
            latest[model_id] = path
    return latest
def get_result_path_from_model(model_id, result_path_per_model):
    """Return the result path registered for ``model_id``.

    Args:
        model_id: Key to look up.
        result_path_per_model: Mapping of model id -> result path.

    Raises:
        KeyError: If ``model_id`` is not a key of the mapping.
    """
    path = result_path_per_model[model_id]
    return path
def load_data(result_path) -> dict:
    """Read one result file through ``fs`` and return the parsed JSON.

    The return annotation used to be ``pd.DataFrame``, but the function
    returns whatever ``json.load`` produces; the DataFrame is only built
    later by the caller.

    Args:
        result_path: Path of the JSON file, as understood by ``fs``.

    Returns:
        The decoded JSON document (callers treat it as a dict).
    """
    # JSON is UTF-8 by specification; do not depend on the locale default
    # encoding of the text wrapper.
    with fs.open(result_path, "r", encoding="utf-8") as f:
        return json.load(f)
def load_result(model_id):
    """Load the stored result of ``model_id`` as two vertical tables.

    The path is resolved through the module-level
    ``latest_result_path_per_model`` mapping, the file is flattened into a
    one-row frame, and that frame is split and transposed.

    Returns:
        ``[results_table, configs_table]``, in that order.
    """
    path = get_result_path_from_model(model_id, latest_result_path_per_model)
    frame = to_dataframe(load_data(path))
    return [to_vertical(select(frame)) for select in (filter_results, filter_configs)]
def to_vertical(df):
    """Transpose ``df`` and label its rows with dotted parameter names.

    Each column label of ``df`` (a sequence of path parts) becomes a row
    label made of the parts joined with ``"."``; the row axis is named
    ``"Parameters"``. The input frame is not modified.
    """
    vertical = df.transpose()
    vertical.index = vertical.index.str.join(".").rename("Parameters")
    return vertical
def to_dataframe(data):
    """Flatten a result document into a one-row DataFrame.

    Keys listed in ``EXCLUDED_KEYS`` are dropped first. Every flattened
    column name is then split on ``"."`` into a tuple of path parts, and the
    single row is labelled with ``data["model_name"]`` (``"Model"`` when the
    key is absent).
    """
    kept = {key: value for key, value in data.items() if key not in EXCLUDED_KEYS}
    df = pd.json_normalize([kept])
    # Tuples on purpose: ``str.split`` alone would hand pandas lists, which
    # the original author noted does not give the wanted column labels.
    df.columns = [tuple(name.split(".")) for name in df.columns]
    df.index = [data.get("model_name", "Model")]
    return df
def filter_results(df):
    """Select the ``results`` columns of ``df`` and shorten their labels.

    Column labels are tuples of path parts. A column is kept when part 0 is
    ``"results"``, part 1 is not in ``EXCLUDED_RESULTS_KEYS`` and part 2 is
    not in ``EXCLUDED_RESULTS_LEADERBOARDS_KEYS``. The leading ``"results"``
    part is then removed and a ``"leaderboard_"`` prefix is stripped from the
    new first part.
    """
    selected = df.loc[:, df.columns.str[0] == "results"]

    excluded_task = selected.columns.str[1].isin(EXCLUDED_RESULTS_KEYS)
    selected = selected.loc[:, ~excluded_task]

    excluded_field = selected.columns.str[2].isin(EXCLUDED_RESULTS_LEADERBOARDS_KEYS)
    selected = selected.loc[:, ~excluded_field]

    selected.columns = [
        (path[0].removeprefix("leaderboard_"), *path[1:])
        for path in selected.columns.str[1:]
    ]
    return selected
def filter_configs(df):
    """Select the ``configs`` columns of ``df`` and shorten their labels.

    Column labels are tuples of path parts. Columns whose part 0 is
    ``"configs"`` are kept; that leading part is then removed and a
    ``"leaderboard_"`` prefix is stripped from the new first part.
    """
    selected = df.loc[:, df.columns.str[0] == "configs"]
    selected.columns = [
        (path[0].removeprefix("leaderboard_"), *path[1:])
        for path in selected.columns.str[1:]
    ]
    return selected
def concat_result_1(result_1, results):
    """Put a freshly loaded first-model table next to the shown second model.

    Args:
        result_1: Frame for the first model; its index is expected to line up
            with the ``"Parameters"`` column of the displayed table.
        results: HTML of the currently displayed table.

    Returns:
        Frame with the index restored as a column, followed by the columns of
        ``result_1`` and then the third column of the displayed table.
    """
    displayed = pd.read_html(io.StringIO(results))[0]
    # Positions 0 and 2 of the displayed table: parameter names + model 2.
    second_model = displayed.iloc[:, [0, 2]].set_index("Parameters")
    combined = pd.concat([result_1, second_model], axis=1)
    return combined.reset_index()
def display_dataframe(df): | |
# style = Styler(df, uuid_len=0, cell_ids=False) | |
return ( | |
df.style | |
.format(na_rep="") | |
.hide(axis="index") | |
.to_html() | |
) | |
def concat_result_2(result_2, results):
    """Put a freshly loaded second-model table next to the shown first model.

    Args:
        result_2: Frame for the second model; its index is expected to line
            up with the ``"Parameters"`` column of the displayed table.
        results: HTML of the currently displayed table.

    Returns:
        Frame with the index restored as a column, followed by the second
        column of the displayed table and then the columns of ``result_2``.
    """
    displayed = pd.read_html(io.StringIO(results))[0]
    # Positions 0 and 1 of the displayed table: parameter names + model 1.
    first_model = displayed.iloc[:, [0, 1]].set_index("Parameters")
    combined = pd.concat([first_model, result_2], axis=1)
    return combined.reset_index()
def _filter_by_task(df, task):
    """Return the rows of ``df`` whose "Parameters" value belongs to ``task``."""
    # "All" (or no selection) means no filtering. This used to work only
    # because slicing the short string "All" happened to yield "".
    if not task or task == "All":
        return df
    prefix = task.removeprefix("leaderboard_")
    return df[df["Parameters"].str.startswith(prefix)]


def _render_tables(loaded, task, displayed, concat):
    """Merge each loaded table with its displayed HTML, filter, and render."""
    merged = (concat(table, html) for table, html in zip(loaded, displayed))
    return [display_dataframe(_filter_by_task(df, task)) for df in merged]


def render_result_1(model_id, task, *results):
    """Load ``model_id`` into the first model column of every displayed table.

    Args:
        model_id: Model chosen in the first dropdown; may be empty/``None``.
        task: Selected task value, or ``"All"`` for no filtering.
        *results: HTML of the currently displayed tables.

    Returns:
        List of HTML strings, one per displayed table. When no model is
        selected the displayed tables are returned unchanged instead of
        failing on the path lookup.
    """
    if not model_id:
        return list(results)
    return _render_tables(load_result(model_id), task, results, concat_result_1)


def render_result_2(model_id, task, *results):
    """Load ``model_id`` into the second model column of every displayed table.

    Args:
        model_id: Model chosen in the second dropdown; may be empty/``None``.
        task: Selected task value, or ``"All"`` for no filtering.
        *results: HTML of the currently displayed tables.

    Returns:
        List of HTML strings, one per displayed table. When no model is
        selected the displayed tables are returned unchanged instead of
        failing on the path lookup.
    """
    if not model_id:
        return list(results)
    return _render_tables(load_result(model_id), task, results, concat_result_2)


def render_results(model_id_1, model_id_2, task, *results):
    """Re-render both model columns, e.g. after the task selection changed.

    The first model is rendered into the displayed tables, then the second
    model is rendered into that output. A model that is not selected is
    skipped by the per-model functions.
    """
    results = render_result_1(model_id_1, task, *results)
    return render_result_2(model_id_2, task, *results)
# NOTE: there is no ``__main__`` guard; the path index is built, the UI is
# assembled, and the app is launched as soon as this module is executed.
latest_result_path_per_model = filter_latest_result_path_per_model(fetch_result_paths())

with gr.Blocks(fill_height=True) as demo:
    gr.HTML("<h1 style='text-align: center;'>Compare Results of the 🤗 Open LLM Leaderboard</h1>")
    gr.HTML("<h3 style='text-align: center;'>Select 2 results to load and compare</h3>")

    # One selector + load button per compared model.
    with gr.Row():
        with gr.Column():
            model_id_1 = gr.Dropdown(choices=list(latest_result_path_per_model), label="Results")
            load_btn_1 = gr.Button("Load")
        with gr.Column():
            model_id_2 = gr.Dropdown(choices=list(latest_result_path_per_model), label="Results")
            load_btn_2 = gr.Button("Load")

    with gr.Row():
        task = gr.Radio(
            ["All", *TASKS.values()],
            label="Tasks",
            info="Evaluation tasks to be displayed",
            value="All",
        )

    # One HTML table per tab. The order here must match the order of the
    # tables returned by the render callbacks (results first, then configs).
    results = []
    with gr.Row():
        for _tab_label in ("Results", "Configs"):
            with gr.Tab(_tab_label):
                results.append(gr.HTML(value=DEFAULT_HTML_TABLE))

    # Each callback receives the currently displayed tables and replaces them.
    load_btn_1.click(
        render_result_1,
        inputs=[model_id_1, task, *results],
        outputs=[*results],
    )
    load_btn_2.click(
        render_result_2,
        inputs=[model_id_2, task, *results],
        outputs=[*results],
    )
    task.change(
        render_results,
        inputs=[model_id_1, model_id_2, task, *results],
        outputs=[*results],
    )

demo.launch()