# This file is kept for reference only and is not used in the enhanced implementation.
# The actual implementation is in enhanced_leaderboard.py

import datetime
import json
import os

import pandas as pd
from loguru import logger

from src.envs import ADMIN_USERS, EVAL_RESULTS_PATH


def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -> list[dict]:
    model_results = []
    dirpath = os.path.join(repo_dir, competition_type, eval_split)
    for root, _, files in os.walk(dirpath):
        # Only descend into directories that contain nothing but JSON result files.
        if len(files) == 0 or not all(f.endswith(".json") for f in files):
            continue
        for file in files:
            # Check if the file name is a valid submission id
            if not file.startswith(f"{competition_type}__"):
                continue
            filepath = os.path.join(root, file)
            try:
                with open(filepath, "r") as fp:
                    result = json.load(fp)
                model_results.append(result)
            except Exception as e:
                logger.error(f"Error loading model result from {filepath}: {e}")
                continue
    return model_results


def fetch_tossup_elo_results(repo_dir: str, eval_split: str) -> dict[str, float]:
    # elo_results.json maps submission id -> AI-vs-AI score (consumed via .get() below).
    dirpath = os.path.join(repo_dir, "tossup", eval_split)
    filepath = os.path.join(dirpath, "elo_results.json")
    with open(filepath, "r") as fp:
        elo_results = json.load(fp)
    return elo_results


def get_submission_date(result: dict) -> datetime.date:
    submission_id = result["id"]
    datetime_str = submission_id.split("__")[-3]
    # str format is YYYYMMDD_HHMMSS in UTC. Parse as UTC, then convert to the
    # eastern-time (fixed UTC-5) date.
    datetime_obj = datetime.datetime.strptime(datetime_str, "%Y%m%d_%H%M%S").replace(
        tzinfo=datetime.timezone.utc
    )
    return datetime_obj.astimezone(datetime.timezone(datetime.timedelta(hours=-5))).date()


def qualify_for_private_observation(username: str, logged_in_username: str | None) -> bool:
    if not logged_in_username:
        return False
    if logged_in_username in ADMIN_USERS:
        return True
    if logged_in_username == username:
        return True
    return False


def get_tossups_leaderboard_df(
    repo_dir: str,
    eval_split: str,
    cutoff_date: datetime.date | None = None,
    logged_in_username: str | None = None,
) -> pd.DataFrame:
    model_results = fetch_model_results(repo_dir, "tossup", eval_split)
    elo_results = fetch_tossup_elo_results(repo_dir, eval_split)
    eval_results = []
    for result in model_results:
        try:
            submission_id = result["id"]
            metrics = result["metrics"]
            username = result["username"]
            model_name = result["model_name"]
            submission_name = f"{username}/{model_name}"
            # Submissions newer than the cutoff are only visible to admins and their owner,
            # and are marked with "(*)".
            if cutoff_date and cutoff_date < get_submission_date(result):
                if not qualify_for_private_observation(username, logged_in_username):
                    continue
                submission_name = f"{username}/{model_name} (*)"
            e_score_ai = elo_results.get(submission_id, 0.0)
            # Overall score: average of the expected score vs. humans and the AI-vs-AI score.
            overall_expected_score = 0.5 * (metrics["expected_score"] + e_score_ai)
            row = {
                "Submission": submission_name,
                "E [Score] ⬆️": overall_expected_score,
                "E [Score] (🙋🏻)": metrics["expected_score"],
                "E [Score] (🤖)": e_score_ai,
                "Cost ⬇️": result["cost"],
                "Buz Prec.": metrics["buzz_accuracy"],
                "Buz Freq.": metrics["buzz_frequency"],
                "Buzz Position": metrics["buzz_position"],
                "Win Rate w/ 🙋🏻": metrics.get("human_win_rate", None),
            }
            eval_results.append(row)
        except Exception as e:
            logger.error(
                f"Error processing model result for eval_split={eval_split} "
                f"'{result.get('username')}/{result.get('model_name')}': {e}"
            )
            continue
    df = pd.DataFrame(eval_results)
    df.sort_values(by="E [Score] ⬆️", ascending=False, inplace=True)
    return df


def get_bonuses_leaderboard_df(
    repo_dir: str,
    eval_split: str,
    cutoff_date: datetime.date | None = None,
    logged_in_username: str | None = None,
) -> pd.DataFrame:
    model_results = fetch_model_results(repo_dir, "bonus", eval_split)
    eval_results = []
    for result in model_results:
        try:
            metrics = result["metrics"]
            username = result["username"]
            model_name = result["model_name"]
            submission_name = f"{username}/{model_name}"
            if cutoff_date and cutoff_date < get_submission_date(result):
                if not qualify_for_private_observation(username, logged_in_username):
                    continue
                submission_name = f"{username}/{model_name} (*)"
            row = {
                "Submission": submission_name,
                "Cost ⬇️": result["cost"],
                "Effect ⬆️": metrics["effectiveness"],
                "Part Acc": metrics["part_accuracy"],
                "Question Acc": metrics["question_accuracy"],
                "Calibration": metrics["calibration"],
                "Adoption": metrics["adoption"],
            }
            eval_results.append(row)
        except Exception as e:
            logger.exception(
                f"Error processing model result "
                f"'{result.get('username')}/{result.get('model_name')}': {e}"
            )
            continue
    df = pd.DataFrame(eval_results)
    df.sort_values(by=["Effect ⬆️", "Question Acc", "Part Acc"], ascending=False, inplace=True)
    return df


def colour_pos_neg(v):
    """Return a CSS rule for the cell that called the function."""
    if pd.isna(v):  # keep NaNs unstyled
        return ""
    return "color: green;" if float(v) > 0 else "color: red;"


def color_cost(v):
    if pd.isna(v):
        return ""
    # Bucket the cost into 5 categories with darker colors
    cost = float(v)
    if cost < 1:
        return "color: #006400;"  # dark green
    elif cost < 2:
        return "color: #00008b;"  # dark blue
    elif cost < 3:
        return "color: #8b8b00;"  # dark yellow
    elif cost < 4:
        return "color: #8b4500;"  # dark orange
    else:
        return "color: #8b0000;"  # dark red


# Helper function to bold the highest value in a column
def bold_max(s):
    is_max = s == s.max()
    return ["font-weight: bold" if v else "" for v in is_max]


def highlight_private_row(row):
    return ["background-color: lightblue" if row["Submission"].endswith("(*)") else "" for _ in row]


def fetch_tossup_leaderboard(
    split: str = "tiny_eval",
    style: bool = True,
    date: datetime.date | None = None,
    username: str | None = None,
):
    df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)

    # Apply formatting and styling
    percent_cols = ["Buz Prec.", "Buz Freq.", "Win Rate w/ 🙋🏻"]
    float_cols = ["E [Score] ⬆️", "E [Score] (🙋🏻)", "E [Score] (🤖)", "Buzz Position"]
    styled_df = (
        df.style.format(
            {
                **dict.fromkeys(percent_cols, "{:>6.1%}"),
                **dict.fromkeys(float_cols, "{:6.3f}"),
                "Cost ⬇️": "${:,.2f}",
            }
        )
        .map(colour_pos_neg, subset=["E [Score] ⬆️", "E [Score] (🤖)", "E [Score] (🙋🏻)"])
        .map(color_cost, subset=["Cost ⬇️"])
        .apply(highlight_private_row, axis=1)
        .apply(
            bold_max,
            subset=[*percent_cols, *float_cols],
            axis=0,
        )
    )
    return styled_df if style else df


def fetch_bonus_leaderboard(
    split: str = "tiny_eval",
    style: bool = True,
    date: datetime.date | None = None,
    username: str | None = None,
):
    df = get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)

    # Apply formatting and styling
    styled_df = (
        df.style.format(
            {
                "Question Acc": "{:>6.1%}",
                "Part Acc": "{:>6.1%}",
                "Effect ⬆️": "{:6.3f}",
                "Calibration": "{:>6.1%}",
                "Adoption": "{:>6.1%}",
                "Cost ⬇️": "${:,.2f}",
            }
        )
        .map(colour_pos_neg, subset=["Effect ⬆️"])
        .map(color_cost, subset=["Cost ⬇️"])
        .apply(highlight_private_row, axis=1)
        .apply(
            bold_max,
            subset=["Effect ⬆️", "Question Acc", "Part Acc", "Calibration", "Adoption"],
            axis=0,
        )
    )
    return styled_df if style else df


# TODO: Implement this once we have the proxy server running.
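# Worked example (hypothetical numbers): create_overall_leaderboard() below pairs each
# user's best tossup row with their best bonus row and sums the tossup "E [Score] ⬆️"
# and the bonus "Effect ⬆️" into "Overall Score ⬆️", counting a missing side as 0 via
# fillna(0). So a user with a 0.12 tossup score and a 0.30 bonus effectiveness would
# land at 0.42 overall.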
def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame) -> pd.DataFrame:
    # Helper to extract username from 'Submission' (format: username/model_name)
    def extract_username(submission: str) -> str:
        username = submission.split("/", 1)[0] if "/" in submission else submission
        if submission.endswith(" (*)"):
            username = username + " (*)"
        return username

    # Add username columns
    tossup_df = tossup_df.copy()
    tossup_df["Username"] = tossup_df["Submission"].apply(extract_username)
    bonus_df = bonus_df.copy()
    bonus_df["Username"] = bonus_df["Submission"].apply(extract_username)

    # Pick best tossup per user (highest Expected Score ⬆️)
    tossup_best = tossup_df.sort_values("E [Score] ⬆️", ascending=False).drop_duplicates("Username")
    tossup_best = tossup_best.set_index("Username")

    # Pick best bonus per user (highest Effect ⬆️)
    bonus_best = bonus_df.sort_values("Effect ⬆️", ascending=False).drop_duplicates("Username")
    bonus_best = bonus_best.set_index("Username")

    # Merge on Username (outer join to include users who have only one type)
    merged = pd.merge(
        tossup_best,
        bonus_best,
        left_index=True,
        right_index=True,
        how="outer",
        suffixes=("_tossup", "_bonus"),
    )

    # Compose a summary row per user
    # Columns: Username, Tossup Submission, Bonus Submission, all metrics from both
    leaderboard = pd.DataFrame(
        {
            "Username": merged.index,
            # Keep only the model-name part after the first "/" (matches extract_username).
            "Tossup Submission": merged["Submission_tossup"].str.split("/", n=1).str[1],
            "Bonus Submission": merged["Submission_bonus"].str.split("/", n=1).str[1],
            "Overall Score ⬆️": merged[["E [Score] ⬆️", "Effect ⬆️"]].fillna(0).sum(axis=1),
            "Tossup Score ⬆️": merged["E [Score] ⬆️"],
            "Bonus Effect ⬆️": merged["Effect ⬆️"],
            "Bonus Part Acc": merged["Part Acc"],
            "Bonus Adoption": merged["Adoption"],
        }
    )
    leaderboard = leaderboard.sort_values("Overall Score ⬆️", ascending=False)
    return leaderboard.reset_index(drop=True)


def highlight_overall_row(row):
    return ["background-color: lightblue" if row["Username"].endswith("(*)") else "" for _ in row]


def fetch_overall_leaderboard(
    split: str = "tiny_eval",
    style: bool = True,
    date: datetime.date | None = None,
    username: str | None = None,
):
    bonus_df = fetch_bonus_leaderboard(split, style=False, date=date, username=username)
    tossup_df = fetch_tossup_leaderboard(split, style=False, date=date, username=username)
    overall_df = create_overall_leaderboard(tossup_df, bonus_df)

    # Apply formatting and styling
    styled_df = (
        overall_df.style.format(
            {
                "Overall Score ⬆️": "{:6.3f}",
                "Tossup Score ⬆️": "{:6.3f}",
                "Bonus Effect ⬆️": "{:6.3f}",
                "Bonus Part Acc": "{:>6.1%}",
                "Bonus Adoption": "{:>6.1%}",
            },
            na_rep="-",
        )
        .map(colour_pos_neg, subset=["Overall Score ⬆️"])
        .apply(highlight_overall_row, axis=1)
        .apply(
            bold_max,
            subset=["Overall Score ⬆️", "Tossup Score ⬆️", "Bonus Effect ⬆️"],
            axis=0,
        )
    )
    return styled_df if style else overall_df
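

# Usage sketch (illustrative; this entry point is an assumption, not part of the
# original interface). It exercises the fetchers with style=False so plain
# DataFrames are printed, assuming EVAL_RESULTS_PATH already holds results for
# the default "tiny_eval" split.
if __name__ == "__main__":
    tossup_df = fetch_tossup_leaderboard(split="tiny_eval", style=False)
    bonus_df = fetch_bonus_leaderboard(split="tiny_eval", style=False)
    overall_df = fetch_overall_leaderboard(split="tiny_eval", style=False)
    print(tossup_df.head())
    print(bonus_df.head())
    print(overall_df.head())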