import sys
from datetime import date, datetime
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from loguru import logger
from src.about import TITLE
from src.display.css_html_js import custom_css
from src.envs import (
API,
COMPETITION_URL,
CUTOFF_DATES,
EVAL_RESULTS_PATH,
EVAL_SPLITS,
LEADERBOARD_REFRESH_INTERVAL,
REGISTRATION_URL,
REPO_ID,
RESULTS_REPO,
SUBMISSION_URL,
TOKEN,
)
from src.hf_dataset_utils import download_dataset_snapshot
from src.populate import (
fetch_bonus_leaderboard,
fetch_overall_leaderboard,
fetch_tossup_leaderboard,
)
logger.remove()
logger.add(sys.stderr, level="INFO", backtrace=True, diagnose=False)
# Load metrics manual content
def load_metrics_manual():
try:
        with open("metrics_manual.md", "r", encoding="utf-8") as f:
return f.read()
except Exception as e:
logger.error(f"Error loading metrics manual: {e}")
return "# Metrics Manual\n\nCould not load metrics manual content."
def restart_space():
API.restart_space(repo_id=REPO_ID)
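# Download the evaluation results dataset at startup; on failure, restart the Space to retry.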
try:
    logger.info(f"Downloading eval results to {EVAL_RESULTS_PATH}")
snapshot_download(
repo_id=RESULTS_REPO,
local_dir=EVAL_RESULTS_PATH,
repo_type="dataset",
tqdm_class=None,
etag_timeout=30,
token=TOKEN,
)
except Exception:
    logger.exception("Failed to download the results snapshot; restarting the Space.")
    restart_space()
def refresh_leaderboard(
split: str = "tiny_eval",
style: bool = True,
    date: date | None = None,
    profile: gr.OAuthProfile | None = None,
):
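    """Re-download the latest results and rebuild the tossup, bonus, and overall leaderboards.

    `profile` is injected by Gradio's OAuth integration when the viewer is logged in;
    it is used to also show that user's own post-cutoff submissions.
    """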
download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
    # profile is None when the viewer is not logged in.
    username = profile.username if profile is not None else None
tossup_df = fetch_tossup_leaderboard(split, style, date, username)
bonus_df = fetch_bonus_leaderboard(split, style, date, username)
overall_df = fetch_overall_leaderboard(split, style, date, username)
return tossup_df, bonus_df, overall_df
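# Build one leaderboard tab: explanatory notes, the three result tables, and auto-refresh wiring.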
def create_leaderboard_interface(app, refresh_btn, split: str = "tiny_eval", date: date | None = None):
leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
tossup_df, bonus_df, overall_df = refresh_leaderboard(split, style=True, date=date)
    gr.HTML(
        "ℹ️ E [Score] is the Expected Score for a question. 🙋🏻 and 🤖 indicate the scores "
        "against just the Human and the AI players respectively.<br/>"
        "ℹ️ Cost is the cost in USD of executing the pipeline per question prefix. "
        "(Typically we have up to ~20 prefixes per tossup question)<br/>"
        "ℹ️ When does the cost matter? When two models buzz at the same token, which they "
        "often do, a lighter (cost-effective) model takes precedence."
    )
tossup_leaderboard = gr.Dataframe(
value=tossup_df,
show_search=True,
label=" đī¸ Tossup Round Leaderboard",
show_label=True,
datatype=["str", "number", "number", "number", "number", "number", "number"],
elem_id="tossup-table",
interactive=False, # Ensure it's not interactive
)
    gr.HTML(
        "ℹ️ Cost for the Bonus pipeline is the cost in USD of executing the pipeline "
        "per bonus part. (We have exactly 3 parts per bonus question)"
    )
bonus_leaderboard = gr.Dataframe(
value=bonus_df,
show_search=True,
label=" đ§ Bonus Round Leaderboard",
show_label=True,
datatype=["str", "number", "number", "number", "number", "number", "number", "number", "number"],
elem_id="bonus-table",
interactive=False, # Ensure it's not interactive
)
overall_leaderboard = gr.Dataframe(
value=overall_df,
show_search=True,
label=" đĨ Overall Leaderboard",
show_label=True,
datatype=["str", "str", "str", "number", "number", "number", "number", "number", "number"],
)
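    # Refresh all three tables on the periodic timer tick, the manual refresh button,
    # and the initial page load; gr.State pins this tab's split and cutoff date.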
gr.on(
triggers=[leaderboard_timer.tick, refresh_btn.click, app.load],
fn=refresh_leaderboard,
inputs=[gr.State(split), gr.State(True), gr.State(date)],
outputs=[tossup_leaderboard, bonus_leaderboard, overall_leaderboard],
)
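# Top-level app layout: title, registration links, a login-gated note, and one tab per eval split.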
with gr.Blocks(css=custom_css) as demo:
gr.HTML(TITLE)
with gr.Row():
with gr.Column(scale=5):
gr.Markdown(
f"## đ Register [here]({REGISTRATION_URL}) to participate in our [Human-AI Cooperative Trivia Competition]({COMPETITION_URL}).\n"
f"## đ˛ Create and submit your quizbowl AI agents at our [submission site]({SUBMISSION_URL}).",
elem_classes="welcome-text",
)
logged_note = gr.Markdown(
"## đ **Note:** Rows in blue with **(*)** are your submissions past the cutoff date and are only visible to you.",
visible=False,
)
with gr.Column(scale=2):
beautify_date = datetime.strptime(CUTOFF_DATES["Week 2"], "%Y-%m-%d").strftime("%B %d, %Y")
gr.Markdown(f"## đ
Next Cutoff Date: {beautify_date}")
gr.LoginButton("Login to privately view your scores on past weeks.")
            refresh_btn = gr.Button("🔄 Refresh")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
for i, (name, split) in enumerate(EVAL_SPLITS.items()):
with gr.TabItem(f"đ
{name}", elem_id="llm-benchmark-tab-table", id=i):
cutoff_date = CUTOFF_DATES[name]
date = datetime.strptime(cutoff_date, "%Y-%m-%d").date()
create_leaderboard_interface(demo, refresh_btn, split, date)
# Add the Metrics Guide tab
with gr.TabItem("đ Metrics Guide", elem_id="metrics-guide-tab"):
gr.Markdown(load_metrics_manual())
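    # Show the private-submissions note only for logged-in viewers.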
    def check_user_logged_in(profile: gr.OAuthProfile | None = None):
        return gr.update(visible=profile is not None)
demo.load(check_user_logged_in, outputs=[logged_note])
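# Optional: a background scheduler that restarts the Space every 30 minutes.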
# scheduler = BackgroundScheduler()
# scheduler.add_job(restart_space, "interval", seconds=1800)
# scheduler.start()
demo.queue(default_concurrency_limit=40).launch()