import gradio as gr | |
import pandas as pd | |
import json | |
from src.about import ( | |
REPRODUCIBILITY_TEXT, | |
INTRODUCTION_TEXT, | |
ABOUT_TEXT, | |
TITLE, | |
) | |
from src.display.css_html_js import custom_css, custom_js | |
# from src.display.utils import ( | |
# COLS, | |
# ST_BENCHMARK_COLS, | |
# AGENTIC_BENCHMARK_COLS, | |
# EVAL_COLS, | |
# AutoEvalColumn, | |
# fields, | |
# ) | |
# from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN | |
# from src.populate import get_evaluation_queue_df, get_leaderboard_df, TASK_NAME_INVERSE_MAP | |
# from src.submission.submit import add_new_eval | |
from src.display.formatting import make_clickable_field | |
# def restart_space(): | |
# API.restart_space(repo_id=REPO_ID) | |
# ### Space initialisation | |
# try: | |
# print(EVAL_REQUESTS_PATH) | |
# snapshot_download( | |
# repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN | |
# ) | |
# except Exception: | |
# restart_space() | |
# try: | |
# print(EVAL_RESULTS_PATH) | |
# snapshot_download( | |
# repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN | |
# ) | |
# except Exception: | |
# restart_space() | |
# ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, ST_BENCHMARK_COLS) | |
# AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, AGENTIC_BENCHMARK_COLS) | |
# ( | |
# finished_eval_queue_df, | |
# running_eval_queue_df, | |
# pending_eval_queue_df, | |
# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS) | |
# def bold_max(s): | |
# is_max = s == s.max() # Boolean Series: True for the max value(s) | |
# return ['font-weight: bold' if v else '' for v in is_max] | |
# def init_leaderboard(df, benchmark_type): | |
# if df is None or df.empty: | |
# raise ValueError("Leaderboard DataFrame is empty or None.") | |
# non_task_cols = ["Model"] | |
# if benchmark_type == "agentic": | |
# # Include agent column | |
# non_task_cols.append("Agent") | |
# elif benchmark_type == "base": | |
# # Drop agent column | |
# dataframe = dataframe.drop(columns=["Agent"]) | |
# AutoEvalColumnSubset = [c for c in fields(AutoEvalColumn) if ((c.name in non_task_cols) or (TASK_NAME_INVERSE_MAP.get(c.name, dict()).get("type", "")==benchmark_type))] | |
# styler = dataframe.style.apply(bold_max, subset=pd.IndexSlice[:, dataframe.columns[1:]]) | |
# df.style.set_table_styles([ | |
# {'selector': 'th', 'props': [('text-align', 'center')]}, | |
# {'selector': 'td', 'props': [('text-align', 'center')]} | |
# ]) | |
# Define a common tooltip text | |
# tooltip_text = "This is the common tooltip" | |
# # Create a tooltip DataFrame with the same shape as df, | |
# # filled with the same tooltip text for each cell. | |
# tooltips = pd.DataFrame(tooltip_text, index=df.index, columns=df.columns) | |
# # Apply the tooltips to the DataFrame | |
# styled_df = df.style.set_tooltips(tooltips) | |
# return gr.components.Dataframe( | |
# value=df, | |
# datatype=[c.type for c in AutoEvalColumnSubset], | |
# column_widths=["150px" if c.name != "Model" else "250px" for c in AutoEvalColumnSubset], | |
# wrap=False, | |
# ) | |
def build_leaderboard(type):
    """Build the leaderboard table for one benchmark category.

    Reads ``data/results.json`` (per-model results) and ``data/tasks.json``
    (task metadata), keeps only the tasks whose ``type`` field equals
    *type*, and produces one row per model with one column per task.

    Scores are multiplied by 100 and rounded to two decimals. When a result
    entry carries a ``log_url`` the score is wrapped with
    ``make_clickable_field``. Missing scores are shown as ``"--"``.

    Args:
        type: Task category to display; compared against each task's
            ``type`` field. For ``"agentic"``, models without any non-null
            agentic score are skipped and a fixed "Agent" column is inserted
            right after "Model". (The name shadows the builtin; it is kept
            so existing callers keep working.)

    Returns:
        A ``gr.components.Dataframe`` in which every column uses the
        ``"html"`` datatype.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open('data/results.json', 'r', encoding='utf-8') as f:
        results = json.load(f)
    with open('data/tasks.json', 'r', encoding='utf-8') as f:
        tasks = json.load(f)

    # Keep only the tasks that belong to the requested category.
    filtered_tasks = {k: v for k, v in tasks.items() if v['type'] == type}

    data = []
    for model_data in results.values():
        model_results = model_data['results']

        # For the agentic table, skip models whose agentic scores are all null.
        if type == "agentic":
            has_agentic_results = any(
                model_results.get(task, {}).get(task_info['metric']) is not None
                for task, task_info in filtered_tasks.items()
            )
            if not has_agentic_results:
                continue

        config = model_data["config"]
        row = {
            'Model': make_clickable_field(config["model_name"], config["model_sha"])
        }

        for dataset, metrics in model_results.items():
            task_info = filtered_tasks.get(dataset)
            if task_info is None:
                # Task belongs to another category (or is unknown).
                continue

            # Read the score under the task's configured metric name, the
            # same key the agentic filter above uses, so the result does not
            # depend on key order inside the JSON object.
            metric_key = task_info.get('metric')
            if metric_key in metrics:
                value = metrics[metric_key]
            else:
                # No configured metric in this entry: fall back to the first
                # reported value that is not the log link.
                value = next(
                    (v for k, v in metrics.items() if k != 'log_url'), None
                )

            log_url = metrics.get('log_url')
            if value is not None:
                # Express as a percentage with two decimals; link to the log
                # when one is available.
                value = round(value * 100, 2)
                if log_url:
                    value = make_clickable_field(value, log_url)

            # Column header is the display name from tasks.json.
            row[task_info['display_name']] = value

        data.append(row)

    results_df = pd.DataFrame(data)
    if 'Model' not in results_df.columns:
        # No model qualified. Keep a "Model" column so the table still has a
        # header and the positional "Agent" insert below stays valid.
        results_df = pd.DataFrame(columns=['Model'])

    # Show missing scores as "--".
    results_df = results_df.fillna("--")

    if type == "agentic":
        # Agent column goes second, right after Model.
        results_df.insert(1, 'Agent', '[Basic Agent](https://inspect.ai-safety-institute.org.uk/agents.html#sec-basic-agent)')

    return gr.components.Dataframe(
        value=results_df,
        datatype=["html" for _ in results_df.columns],
        column_widths=["250px" if c == "Model" else "150px" for c in results_df.columns],
        wrap=False,
    )
# Logo files referenced by the page header below.
black_logo_path = "src/assets/logo-icon-black.png"
white_logo_path = "src/assets/logo-icon-white.png"

# Top-level app container with the project's custom CSS/JS and a pink theme.
demo = gr.Blocks(
    css=custom_css,
    js=custom_js,
    theme=gr.themes.Default(primary_hue=gr.themes.colors.pink),
    fill_height=True,
    fill_width=True,
)

with demo:
    # Page header: both logo variants plus the title and tagline.
    gr.HTML(f"""
    <div id="page-header">
    <div id="header-container">
    <div id="left-container">
    <img id="black-logo" src="/gradio_api/file={black_logo_path}">
    <img id="white-logo" src="/gradio_api/file={white_logo_path}">
    </div>
    <div id="centre-container">
    <h1 style="margin-bottom: 0.25rem;">{TITLE}</h1>
    <p style="color:#eb088a; margin:0; font-size:1.2rem;">Explore Interactive Results & Traces</p>
    </div>
    <div id="right-container">
    </div>
    </div>
    </div>
    """)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="intro-text", sanitize_html=False)

    # One tab per leaderboard category, followed by the two text pages.
    with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
        with gr.TabItem("Base Benchmarks", elem_classes="llm-benchmark-tab-table", id=0):
            build_leaderboard("base")
        with gr.TabItem("Agentic Benchmarks", elem_classes="llm-benchmark-tab-table", id=1):
            build_leaderboard("agentic")
        with gr.TabItem("About", elem_classes="llm-benchmark-tab-table", id=2):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text", sanitize_html=False)
        with gr.TabItem("Reproducibility", elem_classes="llm-benchmark-tab-table", id=3):
            gr.Markdown(REPRODUCIBILITY_TEXT, elem_classes="markdown-text", sanitize_html=False)

# The header <img> tags load the logos through Gradio's file route, which
# only serves explicitly whitelisted files, so the logo paths are passed
# to launch() as allowed_paths.
assets = [black_logo_path, white_logo_path]
demo.launch(allowed_paths=assets)