Spaces:
Running
Running
import datetime as dt
import sys
from datetime import datetime

import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import Leaderboard
from huggingface_hub import snapshot_download
from loguru import logger

from src.about import (
    INTRODUCTION_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.envs import (
    API,
    COMPETITION_URL,
    CUTOFF_DATES,
    EVAL_RESULTS_PATH,
    EVAL_SPLITS,
    LEADERBOARD_REFRESH_INTERVAL,
    REGISTRATION_URL,
    REPO_ID,
    RESULTS_REPO,
    SUBMISSION_URL,
    TOKEN,
)
from src.hf_dataset_utils import download_dataset_snapshot
from src.populate import (
    fetch_bonus_leaderboard,
    fetch_overall_leaderboard,
    fetch_tossup_leaderboard,
)
# Configure loguru: drop the default handler, then log INFO+ to stderr.
# backtrace=True keeps full tracebacks on errors; diagnose=False avoids
# dumping local variable values (which may include tokens) into the logs.
logger.remove()
logger.add(sys.stderr, level="INFO", backtrace=True, diagnose=False)
def load_metrics_manual() -> str:
    """Return the contents of ``metrics_manual.md``.

    Falls back to a short placeholder document when the file is missing or
    unreadable, so the "Metrics Guide" tab always has something to render.

    Returns:
        The manual's markdown text, or a placeholder on read failure.
    """
    try:
        # Explicit encoding: the platform default is not guaranteed to be
        # UTF-8, and the manual contains non-ASCII characters.
        with open("metrics_manual.md", "r", encoding="utf-8") as f:
            return f.read()
    except OSError as e:
        # Narrowed from `except Exception`: only file-access errors are
        # expected here; anything else should surface as a real bug.
        logger.error(f"Error loading metrics manual: {e}")
        return "# Metrics Manual\n\nCould not load metrics manual content."
def restart_space():
    """Restart this Hugging Face Space via the Hub API.

    Used below as a recovery path when the initial results download fails.
    """
    API.restart_space(repo_id=REPO_ID)
# Initial sync of the evaluation-results dataset. If the download fails
# (e.g. a transient Hub or network error), restart the Space so it retries
# from a clean state.
try:
    # Use the configured logger rather than a bare print().
    logger.info(f"Downloading evaluation results to {EVAL_RESULTS_PATH}")
    snapshot_download(
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception:
    # Best-effort recovery: record *why* we are restarting instead of
    # silently swallowing the error, then bounce the Space.
    logger.exception("Initial results snapshot download failed; restarting space")
    restart_space()
def refresh_leaderboard(
    split: str = "tiny_eval",
    style: bool = True,
    # Fixed annotation: with `from datetime import datetime` in scope,
    # `datetime.date` referred to the *method* datetime.date, not the class.
    date: dt.date | None = None,
    profile: gr.OAuthProfile | None = None,
):
    """Fetch fresh tossup, bonus, and overall leaderboards for a split.

    Args:
        split: Evaluation split name to fetch results for.
        style: Whether the fetchers should apply display styling.
        date: Cutoff date passed through to the fetchers; rows past it are
            only shown to their owner.
        profile: Gradio OAuth profile of the viewer, or None when logged out.

    Returns:
        Tuple of (tossup_df, bonus_df, overall_df).
    """
    # Re-sync the results dataset so the boards reflect the latest runs.
    download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
    try:
        # `profile and profile.username` is None for anonymous visitors.
        username = profile and profile.username
    except Exception:
        # Defensive: treat any failure to read the profile as "not logged in".
        username = None
    tossup_df = fetch_tossup_leaderboard(split, style, date, username)
    bonus_df = fetch_bonus_leaderboard(split, style, date, username)
    overall_df = fetch_overall_leaderboard(split, style, date, username)
    return tossup_df, bonus_df, overall_df
def create_leaderboard_interface(app, refresh_btn, split: str = "tiny_eval", date: dt.date | None = None):
    """Build the three leaderboard tables for one split and wire up refresh.

    Args:
        app: The top-level gr.Blocks instance, used for the `load` trigger.
        refresh_btn: Shared refresh button; clicking it reloads the tables.
        split: Evaluation split to display.
        date: Cutoff date forwarded to the leaderboard fetchers.
    """
    # Periodic auto-refresh timer for this tab's tables.
    leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
    # Initial styled content rendered at build time.
    tossup_df, bonus_df, overall_df = refresh_leaderboard(split, style=True, date=date)
    gr.HTML(
        "<div style='font-size: 18px;'>"
        "βΉοΈ <b>E [Score]</b> is the <b>Expected Score</b> for a question. ππ» and π€ indicate the scores against just the Human and the AI players respectively.<br>"
        # Bug fix: this line was missing its trailing <br>, fusing the
        # "Cost" bullet with the "When does the cost matter?" bullet.
        "βΉοΈ <b>Cost</b> is the cost in USD of executing the pipeline <b>per question prefix</b>. (Typically we have upto ~20 prefixes per tossup question)<br>"
        "βΉοΈ <b>When does the cost matter?</b> When two models buzz at the same token, which they often do, a lighter (cost-effective) model takes precedence.<br>"
        "</div>"
    )
    tossup_leaderboard = gr.Dataframe(
        value=tossup_df,
        show_search=True,
        label=" ποΈ Tossup Round Leaderboard",
        show_label=True,
        datatype=["str", "number", "number", "number", "number", "number", "number"],
        elem_id="tossup-table",
        interactive=False,  # Ensure it's not interactive
    )
    gr.HTML(
        "<div style='font-size: 18px;'>"
        "βΉοΈ <b>Cost for Bonus pipeline</b> is the cost in USD of executing the pipeline <b>per bonus part</b>. (We have exactly 3 parts per bonus question)"
        "</div>"
    )
    bonus_leaderboard = gr.Dataframe(
        value=bonus_df,
        show_search=True,
        label=" π§ Bonus Round Leaderboard",
        show_label=True,
        datatype=["str", "number", "number", "number", "number", "number", "number", "number", "number"],
        elem_id="bonus-table",
        interactive=False,  # Ensure it's not interactive
    )
    overall_leaderboard = gr.Dataframe(
        value=overall_df,
        show_search=True,
        label=" π₯ Overall Leaderboard",
        show_label=True,
        datatype=["str", "str", "str", "number", "number", "number", "number", "number", "number"],
    )
    # One shared handler for: the periodic timer, the manual refresh
    # button, and the initial page load.
    gr.on(
        triggers=[leaderboard_timer.tick, refresh_btn.click, app.load],
        fn=refresh_leaderboard,
        inputs=[gr.State(split), gr.State(True), gr.State(date)],
        outputs=[tossup_leaderboard, bonus_leaderboard, overall_leaderboard],
    )
# Top-level UI: header row (welcome text, cutoff date, login, refresh),
# one leaderboard tab per evaluation split, and a Metrics Guide tab.
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(TITLE)
    with gr.Row():
        with gr.Column(scale=5):
            gr.Markdown(
                f"## π Register [here]({REGISTRATION_URL}) to participate in our [Human-AI Cooperative Trivia Competition]({COMPETITION_URL}).\n"
                f"## π² Create and submit your quizbowl AI agents at our [submission site]({SUBMISSION_URL}).",
                elem_classes="welcome-text",
            )
            # Shown only to logged-in users (toggled in check_user_logged_in).
            logged_note = gr.Markdown(
                "## π **Note:** <span style='background-color: lightblue; padding: 10px; margin:4px'>Rows in blue with **(*)**</span> are your submissions past the cutoff date and are only visible to you.",
                visible=False,
            )
        with gr.Column(scale=2):
            # NOTE(review): "Week 2" is hard-coded here while the tabs below
            # use per-split cutoff dates — confirm this is the intended
            # "next" cutoff and not a stale constant.
            beautify_date = datetime.strptime(CUTOFF_DATES["Week 2"], "%Y-%m-%d").strftime("%B %d, %Y")
            gr.Markdown(f"## π Next Cutoff Date: <span style='color:crimson'>{beautify_date}</span>")
            gr.LoginButton("Login to privately view your scores on past weeks.")
            refresh_btn = gr.Button("π Refresh")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # One tab per evaluation split, each with its own cutoff date.
        for i, (name, split) in enumerate(EVAL_SPLITS.items()):
            with gr.TabItem(f"π {name}", elem_id="llm-benchmark-tab-table", id=i):
                # NOTE(review): this timer appears unused —
                # create_leaderboard_interface creates its own; candidate
                # for removal after confirming nothing references it.
                leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
                cutoff_date = CUTOFF_DATES[name]
                date = datetime.strptime(cutoff_date, "%Y-%m-%d").date()
                create_leaderboard_interface(demo, refresh_btn, split, date)
        # Add the Metrics Guide tab
        with gr.TabItem("π Metrics Guide", elem_id="metrics-guide-tab"):
            gr.Markdown(load_metrics_manual())

    def check_user_logged_in(x: gr.OAuthProfile | None = None):
        # Reveal the "private rows" note only when an OAuth profile exists.
        return gr.update(visible=x is not None)

    # Gradio injects the OAuth profile automatically on page load.
    demo.load(check_user_logged_in, outputs=[logged_note])

# NOTE(review): scheduled self-restart is currently disabled.
# scheduler = BackgroundScheduler()
# scheduler.add_job(restart_space, "interval", seconds=1800)
# scheduler.start()
demo.queue(default_concurrency_limit=40).launch()