import sys from datetime import datetime import gradio as gr import pandas as pd from apscheduler.schedulers.background import BackgroundScheduler from gradio_leaderboard import Leaderboard from huggingface_hub import snapshot_download from loguru import logger from src.about import ( INTRODUCTION_TEXT, TITLE, ) from src.display.css_html_js import custom_css from src.envs import ( API, COMPETITION_URL, CUTOFF_DATES, EVAL_RESULTS_PATH, EVAL_SPLITS, LEADERBOARD_REFRESH_INTERVAL, REGISTRATION_URL, REPO_ID, RESULTS_REPO, SUBMISSION_URL, TOKEN, ) from src.hf_dataset_utils import download_dataset_snapshot from src.populate import ( fetch_bonus_leaderboard, fetch_overall_leaderboard, fetch_tossup_leaderboard, ) logger.remove() logger.add(sys.stderr, level="INFO", backtrace=True, diagnose=False) # Load metrics manual content def load_metrics_manual(): try: with open("metrics_manual.md", "r") as f: return f.read() except Exception as e: logger.error(f"Error loading metrics manual: {e}") return "# Metrics Manual\n\nCould not load metrics manual content." def restart_space(): API.restart_space(repo_id=REPO_ID) try: print(EVAL_RESULTS_PATH) snapshot_download( repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN, ) except Exception: restart_space() def refresh_leaderboard( split: str = "tiny_eval", style: bool = True, date: datetime.date = None, profile: gr.OAuthProfile = None, ): download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH) try: username = profile and profile.username except Exception: # If the user is not logged in, profile will be None username = None tossup_df = fetch_tossup_leaderboard(split, style, date, username) bonus_df = fetch_bonus_leaderboard(split, style, date, username) overall_df = fetch_overall_leaderboard(split, style, date, username) return tossup_df, bonus_df, overall_df def create_leaderboard_interface(app, refresh_btn, split: str = "tiny_eval", date: datetime.date = None): leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL) tossup_df, bonus_df, overall_df = refresh_leaderboard(split, style=True, date=date) gr.HTML( "
" "â„šī¸ E [Score] is the Expected Score for a question. 🙋đŸģ and 🤖 indicate the scores against just the Human and the AI players respectively.
" "â„šī¸ Cost is the cost in USD of executing the pipeline per question prefix. (Typically we have upto ~20 prefixes per tossup question)" "â„šī¸ When does the cost matter? When two models buzz at the same token, which they often do, a lighter (cost-effective) model takes precedence.
" "
" ) tossup_leaderboard = gr.Dataframe( value=tossup_df, show_search=True, label=" đŸ›Žī¸ Tossup Round Leaderboard", show_label=True, datatype=["str", "number", "number", "number", "number", "number", "number"], elem_id="tossup-table", interactive=False, # Ensure it's not interactive ) gr.HTML( "
" "â„šī¸ Cost for Bonus pipeline is the cost in USD of executing the pipeline per bonus part. (We have exactly 3 parts per bonus question)" "
" ) bonus_leaderboard = gr.Dataframe( value=bonus_df, show_search=True, label=" 🧐 Bonus Round Leaderboard", show_label=True, datatype=["str", "number", "number", "number", "number", "number", "number", "number", "number"], elem_id="bonus-table", interactive=False, # Ensure it's not interactive ) overall_leaderboard = gr.Dataframe( value=overall_df, show_search=True, label=" đŸĨ‡ Overall Leaderboard", show_label=True, datatype=["str", "str", "str", "number", "number", "number", "number", "number", "number"], ) gr.on( triggers=[leaderboard_timer.tick, refresh_btn.click, app.load], fn=refresh_leaderboard, inputs=[gr.State(split), gr.State(True), gr.State(date)], outputs=[tossup_leaderboard, bonus_leaderboard, overall_leaderboard], ) with gr.Blocks(css=custom_css) as demo: gr.HTML(TITLE) with gr.Row(): with gr.Column(scale=5): gr.Markdown( f"## 📋 Register [here]({REGISTRATION_URL}) to participate in our [Human-AI Cooperative Trivia Competition]({COMPETITION_URL}).\n" f"## 🎲 Create and submit your quizbowl AI agents at our [submission site]({SUBMISSION_URL}).", elem_classes="welcome-text", ) logged_note = gr.Markdown( "## 👉 **Note:** Rows in blue with **(*)** are your submissions past the cutoff date and are only visible to you.", visible=False, ) with gr.Column(scale=2): beautify_date = datetime.strptime(CUTOFF_DATES["Week 2"], "%Y-%m-%d").strftime("%B %d, %Y") gr.Markdown(f"## 📅 Next Cutoff Date:    {beautify_date}") gr.LoginButton("Login to privately view your scores on past weeks.") refresh_btn = gr.Button("🔄 Refresh") with gr.Tabs(elem_classes="tab-buttons") as tabs: for i, (name, split) in enumerate(EVAL_SPLITS.items()): with gr.TabItem(f"🏅 {name}", elem_id="llm-benchmark-tab-table", id=i): leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL) cutoff_date = CUTOFF_DATES[name] date = datetime.strptime(cutoff_date, "%Y-%m-%d").date() create_leaderboard_interface(demo, refresh_btn, split, date) # Add the Metrics Guide tab with gr.TabItem("📊 Metrics Guide", elem_id="metrics-guide-tab"): gr.Markdown(load_metrics_manual()) def check_user_logged_in(x: gr.OAuthProfile | None = None): return gr.update(visible=x is not None) demo.load(check_user_logged_in, outputs=[logged_note]) # scheduler = BackgroundScheduler() # scheduler.add_job(restart_space, "interval", seconds=1800) # scheduler.start() demo.queue(default_concurrency_limit=40).launch()