# leaderboard/app.py
import sys
from datetime import date, datetime

import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler  # used by the (currently disabled) auto-restart scheduler below
from huggingface_hub import snapshot_download
from loguru import logger

from src.about import TITLE
from src.display.css_html_js import custom_css
from src.envs import (
API,
COMPETITION_URL,
CUTOFF_DATES,
EVAL_RESULTS_PATH,
EVAL_SPLITS,
LEADERBOARD_REFRESH_INTERVAL,
REGISTRATION_URL,
REPO_ID,
RESULTS_REPO,
SUBMISSION_URL,
TOKEN,
)
from src.hf_dataset_utils import download_dataset_snapshot
from src.populate import (
fetch_bonus_leaderboard,
fetch_overall_leaderboard,
fetch_tossup_leaderboard,
)

# Route loguru output to stderr at INFO level.
logger.remove()
logger.add(sys.stderr, level="INFO", backtrace=True, diagnose=False)


# Load metrics manual content
def load_metrics_manual():
    try:
        with open("metrics_manual.md", "r") as f:
            return f.read()
    except Exception as e:
        logger.error(f"Error loading metrics manual: {e}")
        return "# Metrics Manual\n\nCould not load metrics manual content."


def restart_space():
    API.restart_space(repo_id=REPO_ID)


# On startup, pull the latest evaluation results; if the download fails,
# restart the Space so it retries from a clean state.
try:
    logger.info(f"Downloading evaluation results to {EVAL_RESULTS_PATH}")
    snapshot_download(
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception:
    logger.exception("Failed to download evaluation results; restarting the space.")
    restart_space()
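
# For reference, a minimal sketch of what a helper like `download_dataset_snapshot`
# (imported above from src.hf_dataset_utils and used below) might wrap. This is
# hypothetical -- the real implementation may differ -- assuming it is simply a
# fault-tolerant variant of `snapshot_download`:
#
#   def download_dataset_snapshot(repo_id: str, local_dir: str) -> None:
#       try:
#           snapshot_download(repo_id=repo_id, local_dir=local_dir,
#                             repo_type="dataset", etag_timeout=30, token=TOKEN)
#       except Exception as e:
#           logger.warning(f"Snapshot refresh failed for {repo_id}: {e}")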


def refresh_leaderboard(
    split: str = "tiny_eval",
    style: bool = True,
    date: date | None = None,
    profile: gr.OAuthProfile | None = None,
):
    """Re-download the results snapshot and rebuild the three leaderboard tables."""
    download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
    try:
        username = profile and profile.username
    except Exception:
        # If the user is not logged in, profile will be None.
        username = None
    tossup_df = fetch_tossup_leaderboard(split, style, date, username)
    bonus_df = fetch_bonus_leaderboard(split, style, date, username)
    overall_df = fetch_overall_leaderboard(split, style, date, username)
    return tossup_df, bonus_df, overall_df
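
# Example: refreshing the default split without a login profile returns three
# DataFrames, one per table:
#
#   tossup_df, bonus_df, overall_df = refresh_leaderboard("tiny_eval", style=True)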


def create_leaderboard_interface(app, refresh_btn, split: str = "tiny_eval", date: date | None = None):
    leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
    tossup_df, bonus_df, overall_df = refresh_leaderboard(split, style=True, date=date)
    gr.HTML(
        "<div style='font-size: 18px;'>"
        "ℹ️ <b>E [Score]</b> is the <b>Expected Score</b> for a question. πŸ™‹πŸ» and πŸ€– indicate the scores against just the Human and the AI players respectively.<br>"
        "ℹ️ <b>Cost</b> is the cost in USD of executing the pipeline <b>per question prefix</b>. (Typically there are up to ~20 prefixes per tossup question.)<br>"
        "ℹ️ <b>When does the cost matter?</b> When two models buzz at the same token, which they often do, the lighter (more cost-effective) model takes precedence.<br>"
        "</div>"
    )
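    # Illustrative only: the cost tie-break described above could be expressed as
    # a sort key that prefers the earlier buzz token and, on ties, the lower cost
    # (hypothetical field names; the actual ranking logic lives in src.populate):
    #
    #   ranked = sorted(rows, key=lambda r: (r["buzz_token"], r["cost"]))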
    tossup_leaderboard = gr.Dataframe(
        value=tossup_df,
        show_search=True,
        label=" πŸ›ŽοΈ Tossup Round Leaderboard",
        show_label=True,
        datatype=["str", "number", "number", "number", "number", "number", "number"],
        elem_id="tossup-table",
        interactive=False,  # display-only table
    )
    gr.HTML(
        "<div style='font-size: 18px;'>"
        "ℹ️ <b>Cost for Bonus pipeline</b> is the cost in USD of executing the pipeline <b>per bonus part</b>. (There are exactly 3 parts per bonus question.)"
        "</div>"
    )
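    # E.g. with exactly 3 parts per bonus question, a per-part cost of $0.002
    # implies roughly 3 x $0.002 = $0.006 per full bonus question
    # (illustrative numbers only).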
    bonus_leaderboard = gr.Dataframe(
        value=bonus_df,
        show_search=True,
        label=" 🧐 Bonus Round Leaderboard",
        show_label=True,
        datatype=["str", "number", "number", "number", "number", "number", "number", "number", "number"],
        elem_id="bonus-table",
        interactive=False,  # display-only table
    )
    overall_leaderboard = gr.Dataframe(
        value=overall_df,
        show_search=True,
        label=" πŸ₯‡ Overall Leaderboard",
        show_label=True,
        datatype=["str", "str", "str", "number", "number", "number", "number", "number", "number"],
        interactive=False,  # display-only table, matching the other two
    )
    # Refresh all three tables on a timer tick, on manual refresh, and on page load.
    gr.on(
        triggers=[leaderboard_timer.tick, refresh_btn.click, app.load],
        fn=refresh_leaderboard,
        inputs=[gr.State(split), gr.State(True), gr.State(date)],
        outputs=[tossup_leaderboard, bonus_leaderboard, overall_leaderboard],
    )
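    # Note: gr.State(split) and gr.State(date) freeze this tab's split and cutoff
    # date at creation time, so each tab refreshes with its own parameters even
    # though all tabs share one refresh button and timer. Gradio fills in the
    # trailing `profile` argument automatically for logged-in users because of
    # its gr.OAuthProfile type hint.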


with gr.Blocks(css=custom_css) as demo:
    gr.HTML(TITLE)
    with gr.Row():
        with gr.Column(scale=5):
            gr.Markdown(
                f"## πŸ“‹ Register [here]({REGISTRATION_URL}) to participate in our [Human-AI Cooperative Trivia Competition]({COMPETITION_URL}).\n"
                f"## 🎲 Create and submit your quizbowl AI agents at our [submission site]({SUBMISSION_URL}).",
                elem_classes="welcome-text",
            )
            logged_note = gr.Markdown(
                "## πŸ‘‰ **Note:** <span style='background-color: lightblue; padding: 10px; margin:4px'>Rows in blue with **(*)**</span> are your submissions past the cutoff date and are visible only to you.",
                visible=False,
            )
        with gr.Column(scale=2):
            # Cutoff date shown in the banner (currently pinned to "Week 2").
            beautify_date = datetime.strptime(CUTOFF_DATES["Week 2"], "%Y-%m-%d").strftime("%B %d, %Y")
            gr.Markdown(f"## πŸ“… Next Cutoff Date: &nbsp;&nbsp; <span style='color:crimson'>{beautify_date}</span>")
            gr.LoginButton("Log in to privately view your scores from past weeks.")
            refresh_btn = gr.Button("πŸ”„ Refresh")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # One leaderboard tab per evaluation split, each with its own cutoff date.
        for i, (name, split) in enumerate(EVAL_SPLITS.items()):
            with gr.TabItem(f"πŸ… {name}", elem_id="llm-benchmark-tab-table", id=i):
                cutoff_date = CUTOFF_DATES[name]
                date = datetime.strptime(cutoff_date, "%Y-%m-%d").date()
                create_leaderboard_interface(demo, refresh_btn, split, date)

        # Add the Metrics Guide tab
        with gr.TabItem("πŸ“Š Metrics Guide", elem_id="metrics-guide-tab"):
            gr.Markdown(load_metrics_manual())

    def check_user_logged_in(profile: gr.OAuthProfile | None = None):
        # Show the blue-row note only for logged-in users.
        return gr.update(visible=profile is not None)

    demo.load(check_user_logged_in, outputs=[logged_note])

# Optionally restart the Space on a fixed interval (disabled for now):
# scheduler = BackgroundScheduler()
# scheduler.add_job(restart_space, "interval", seconds=1800)
# scheduler.start()

demo.queue(default_concurrency_limit=40).launch()