Spaces:

qanta-challenge
/

leaderboard

Running

File size: 11,869 Bytes

# This file is kept for reference only and is not used in the enhanced implementation
# The actual implementation is in enhanced_leaderboard.py

import datetime
import json
import os

import pandas as pd
from loguru import logger

from src.envs import ADMIN_USERS, EVAL_RESULTS_PATH


def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -> list[dict]:
    """Collect every stored submission result for one competition and split.

    Walks ``<repo_dir>/<competition_type>/<eval_split>`` recursively. A folder
    contributes results only if it holds at least one file and every file in
    it ends with ``.json``; inside such a folder only files whose name starts
    with ``"<competition_type>__"`` are parsed.

    Args:
        repo_dir: Root directory of the evaluation results.
        competition_type: Competition name; also the expected file-name prefix.
        eval_split: Name of the evaluation split sub-directory.

    Returns:
        The decoded JSON payload of each readable result file. Files that
        fail to open or parse are logged and left out.
    """
    id_prefix = f"{competition_type}__"
    split_root = os.path.join(repo_dir, competition_type, eval_split)

    loaded = []
    for folder, _subdirs, names in os.walk(split_root):
        # Folders that are empty or contain any non-JSON file are ignored wholesale.
        if not names or any(not name.endswith(".json") for name in names):
            continue
        # Keep only files named like a submission id for this competition.
        candidates = [os.path.join(folder, name) for name in names if name.startswith(id_prefix)]
        for path in candidates:
            try:
                with open(path, "r") as handle:
                    payload = json.load(handle)
                loaded.append(payload)
            except Exception as e:
                # Best effort: one broken file must not hide the other results.
                logger.error(f"Error loading model result from {path}: {e}")

    return loaded


def fetch_tossup_elo_results(repo_dir: str, eval_split: str) -> dict[str, float]:
    """Load the tossup Elo results stored for one evaluation split.

    Reads ``<repo_dir>/tossup/<eval_split>/elo_results.json`` and returns the
    decoded JSON document unchanged.

    Args:
        repo_dir: Root directory of the evaluation results.
        eval_split: Name of the evaluation split sub-directory.

    Returns:
        The parsed JSON document. The tossup leaderboard queries it with
        ``.get(submission_id, 0.0)``, so the file is expected to hold a
        mapping from submission id to a numeric score (inferred from that
        usage; the content is not validated here).

    Raises:
        FileNotFoundError: If the Elo results file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    filepath = os.path.join(repo_dir, "tossup", eval_split, "elo_results.json")
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(filepath, "r", encoding="utf-8") as fp:
        return json.load(fp)


def get_submission_date(result: dict) -> datetime.date:
    """Return the calendar date (UTC-5) on which a submission was made.

    The timestamp is taken from the third-from-last ``"__"``-separated field
    of ``result["id"]`` and must be formatted as ``YYYYMMDD_HHMMSS``. It is
    interpreted as UTC and shifted to a fixed UTC-5 offset before the date is
    extracted.

    Args:
        result: Submission result containing an ``"id"`` entry.

    Returns:
        The submission date at a fixed UTC-5 offset.

    Raises:
        KeyError: If ``result`` has no ``"id"``.
        IndexError: If the id has fewer than three ``"__"``-separated fields.
        ValueError: If the timestamp field does not match ``%Y%m%d_%H%M%S``.
    """
    submission_id = result["id"]
    datetime_str = submission_id.split("__")[-3]
    # The id timestamp is UTC. strptime() returns a naive datetime, and calling
    # astimezone() on a naive value would treat it as the *host's* local time,
    # so the UTC zone has to be attached explicitly first.
    utc_dt = datetime.datetime.strptime(datetime_str, "%Y%m%d_%H%M%S").replace(tzinfo=datetime.timezone.utc)
    # NOTE(review): fixed UTC-5 offset (EST); daylight saving time is not applied.
    eastern = datetime.timezone(datetime.timedelta(hours=-5))
    return utc_dt.astimezone(eastern).date()


def qualify_for_private_observation(username: str, logged_in_username: str | None) -> bool:
    if not logged_in_username:
        return False
    if logged_in_username in ADMIN_USERS:
        return True
    if logged_in_username == username:
        return True
    return False


def get_tossups_leaderboard_df(
    repo_dir: str,
    eval_split: str,
    cutoff_date: datetime.date | None = None,
    logged_in_username: str | None = None,
) -> pd.DataFrame:
    """Build the tossup leaderboard table for one evaluation split.

    Each stored tossup result becomes one row. The headline score is the mean
    of the submission's ``metrics["expected_score"]`` and its entry in the Elo
    results (``0.0`` when the submission id is absent there).

    Submissions dated after ``cutoff_date`` are only included when the viewer
    qualifies for private observation; in that case the submission name gets
    a trailing ``" (*)"`` marker.

    Args:
        repo_dir: Root directory of the evaluation results.
        eval_split: Name of the evaluation split.
        cutoff_date: Latest publicly visible submission date, or ``None`` to
            show everything.
        logged_in_username: Name of the current viewer, if any.

    Returns:
        A DataFrame sorted by ``"E [Score] ⬆️"`` in descending order. It
        always carries the full column set, even when there are no rows.

    Raises:
        FileNotFoundError: If the Elo results file for the split is missing.
    """
    # Declared up front so an empty leaderboard still has the expected columns;
    # a column-less frame would make sort_values() raise KeyError.
    columns = [
        "Submission",
        "E [Score] ⬆️",
        "E [Score] (🙋🏻)",
        "E [Score] (🤖)",
        "Cost ⬇️",
        "Buz Prec.",
        "Buz Freq.",
        "Buzz Position",
        "Win Rate w/ 🙋🏻",
    ]

    model_results = fetch_model_results(repo_dir, "tossup", eval_split)
    elo_results = fetch_tossup_elo_results(repo_dir, eval_split)

    eval_results = []
    for result in model_results:
        # Bound before the try so the error log never reads an unbound name
        # or a name left over from the previous iteration.
        label = "<unknown>"
        try:
            username = result["username"]
            model_name = result["model_name"]
            label = f"{username}/{model_name}"
            submission_id = result["id"]
            metrics = result["metrics"]

            submission_name = label
            if cutoff_date and cutoff_date < get_submission_date(result):
                # Submitted after the cutoff: hidden unless the viewer is the owner or an admin.
                if not qualify_for_private_observation(username, logged_in_username):
                    continue
                submission_name = f"{username}/{model_name} (*)"

            e_score_ai = elo_results.get(submission_id, 0.0)
            overall_expected_score = 0.5 * (metrics["expected_score"] + e_score_ai)
            eval_results.append(
                {
                    "Submission": submission_name,
                    "E [Score] ⬆️": overall_expected_score,
                    "E [Score] (🙋🏻)": metrics["expected_score"],
                    "E [Score] (🤖)": e_score_ai,
                    "Cost ⬇️": result["cost"],
                    "Buz Prec.": metrics["buzz_accuracy"],
                    "Buz Freq.": metrics["buzz_frequency"],
                    "Buzz Position": metrics["buzz_position"],
                    "Win Rate w/ 🙋🏻": metrics.get("human_win_rate", None),
                }
            )
        except Exception as e:
            # Best effort: a malformed result is logged and skipped.
            logger.error(f"Error processing model result for eval_split={eval_split} '{label}': {e}")
            continue

    df = pd.DataFrame(eval_results, columns=columns)
    return df.sort_values(by="E [Score] ⬆️", ascending=False)


def get_bonuses_leaderboard_df(
    repo_dir: str,
    eval_split: str,
    cutoff_date: datetime.date | None = None,
    logged_in_username: str | None = None,
) -> pd.DataFrame:
    """Build the bonus leaderboard table for one evaluation split.

    Each stored bonus result becomes one row. Submissions dated after
    ``cutoff_date`` are only included when the viewer qualifies for private
    observation; in that case the submission name gets a trailing ``" (*)"``
    marker.

    Args:
        repo_dir: Root directory of the evaluation results.
        eval_split: Name of the evaluation split.
        cutoff_date: Latest publicly visible submission date, or ``None`` to
            show everything.
        logged_in_username: Name of the current viewer, if any.

    Returns:
        A DataFrame sorted in descending order by ``"Effect ⬆️"``, then
        ``"Question Acc"``, then ``"Part Acc"``. It always carries the full
        column set, even when there are no rows.
    """
    # Declared up front so an empty leaderboard still has the expected columns;
    # a column-less frame would make sort_values() raise KeyError.
    columns = [
        "Submission",
        "Cost ⬇️",
        "Effect ⬆️",
        "Part Acc",
        "Question Acc",
        "Calibration",
        "Adoption",
    ]

    model_results = fetch_model_results(repo_dir, "bonus", eval_split)

    eval_results = []
    for result in model_results:
        # Bound before the try so the error log never reads an unbound name
        # or a name left over from the previous iteration.
        label = "<unknown>"
        try:
            username = result["username"]
            model_name = result["model_name"]
            label = f"{username}/{model_name}"
            metrics = result["metrics"]

            submission_name = label
            if cutoff_date and cutoff_date < get_submission_date(result):
                # Submitted after the cutoff: hidden unless the viewer is the owner or an admin.
                if not qualify_for_private_observation(username, logged_in_username):
                    continue
                submission_name = f"{username}/{model_name} (*)"

            eval_results.append(
                {
                    "Submission": submission_name,
                    "Cost ⬇️": result["cost"],
                    "Effect ⬆️": metrics["effectiveness"],
                    "Part Acc": metrics["part_accuracy"],
                    "Question Acc": metrics["question_accuracy"],
                    "Calibration": metrics["calibration"],
                    "Adoption": metrics["adoption"],
                }
            )
        except Exception as e:
            # Best effort: a malformed result is logged (with traceback) and skipped.
            logger.exception(f"Error processing model result '{label}': {e}")
            continue

    df = pd.DataFrame(eval_results, columns=columns)
    return df.sort_values(by=["Effect ⬆️", "Question Acc", "Part Acc"], ascending=False)


def colour_pos_neg(v):
    """Pick the text colour for a score cell.

    Missing values get no styling; strictly positive values are green and
    everything else (zero included) is red.
    """
    if pd.isna(v):
        return ""
    if float(v) > 0:
        return "color: green;"
    return "color: red;"


def color_cost(v):
    """Pick the text colour for a cost cell.

    Costs fall into five buckets with boundaries at 1, 2, 3 and 4; each upper
    boundary is exclusive. Missing values get no styling.
    """
    if pd.isna(v):
        return ""

    # (exclusive upper bound, CSS rule), cheapest bucket first.
    tiers = (
        (1, "color: #006400;"),  # dark green
        (2, "color: #00008b;"),  # dark blue
        (3, "color: #8b8b00;"),  # dark yellow
        (4, "color: #8b4500;"),  # dark orange
    )
    amount = float(v)
    for upper_bound, css in tiers:
        if amount < upper_bound:
            return css
    return "color: #8b0000;"  # dark red


# Styling helper: emphasise the largest entry of a column
def bold_max(s):
    """Return one CSS rule per entry of ``s``, bolding every entry equal to its maximum."""
    peak = s.max()
    return ["font-weight: bold" if hit else "" for hit in (s == peak)]


def highlight_private_row(row):
    """Return one CSS rule per cell, tinting the whole row when its submission is private.

    A row counts as private when its ``"Submission"`` value ends with ``"(*)"``.
    """
    width = len(row)
    if not width:
        return []
    is_private = row["Submission"].endswith("(*)")
    return ["background-color: lightblue" if is_private else ""] * width


def fetch_tossup_leaderboard(
    split: str = "tiny_eval",
    style: bool = True,
    date: datetime.date | None = None,
    username: str | None = None,
):
    """Load the tossup leaderboard, optionally formatted for display.

    Args:
        split: Evaluation split to load from ``EVAL_RESULTS_PATH``.
        style: When true, return a pandas ``Styler``; otherwise return the
            plain DataFrame.
        date: Cutoff date forwarded to ``get_tossups_leaderboard_df``.
        username: Logged-in viewer forwarded to ``get_tossups_leaderboard_df``.

    Returns:
        The leaderboard DataFrame, or a ``Styler`` wrapping it when ``style``
        is true.
    """
    df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)
    if not style:
        # Skip building a Styler that would be thrown away immediately.
        return df

    percent_cols = ["Buz Prec.", "Buz Freq.", "Win Rate w/ 🙋🏻"]
    float_cols = ["E [Score] ⬆️", "E [Score] (🙋🏻)", "E [Score] (🤖)", "Buzz Position"]
    return (
        df.style.format(
            {
                **dict.fromkeys(percent_cols, "{:>6.1%}"),
                **dict.fromkeys(float_cols, "{:6.3f}"),
                "Cost ⬇️": "${:,.2f}",
            }
        )
        .map(colour_pos_neg, subset=["E [Score] ⬆️", "E [Score] (🤖)", "E [Score] (🙋🏻)"])
        .map(color_cost, subset=["Cost ⬇️"])
        .apply(highlight_private_row, axis=1)
        .apply(
            bold_max,
            subset=[*percent_cols, *float_cols],
            axis=0,
        )
    )


def fetch_bonus_leaderboard(
    split: str = "tiny_eval",
    style: bool = True,
    date: datetime.date | None = None,
    username: str | None = None,
):
    """Load the bonus leaderboard, optionally formatted for display.

    Args:
        split: Evaluation split to load from ``EVAL_RESULTS_PATH``.
        style: When true, return a pandas ``Styler``; otherwise return the
            plain DataFrame.
        date: Cutoff date forwarded to ``get_bonuses_leaderboard_df``.
        username: Logged-in viewer forwarded to ``get_bonuses_leaderboard_df``.

    Returns:
        The leaderboard DataFrame, or a ``Styler`` wrapping it when ``style``
        is true.
    """
    df = get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)
    if not style:
        # Skip building a Styler that would be thrown away immediately.
        return df

    return (
        df.style.format(
            {
                "Question Acc": "{:>6.1%}",
                "Part Acc": "{:>6.1%}",
                "Effect ⬆️": "{:6.3f}",
                "Calibration": "{:>6.1%}",
                "Adoption": "{:>6.1%}",
                "Cost ⬇️": "${:,.2f}",
            }
        )
        .map(colour_pos_neg, subset=["Effect ⬆️"])
        .map(color_cost, subset=["Cost ⬇️"])
        .apply(highlight_private_row, axis=1)
        .apply(
            bold_max,
            subset=["Effect ⬆️", "Question Acc", "Part Acc", "Calibration", "Adoption"],
            axis=0,
        )
    )


# TODO: Implement this once we have the proxy server running.
def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame) -> pd.DataFrame:
    """Combine the tossup and bonus leaderboards into one row per user.

    For every user the best tossup submission (highest ``"E [Score] ⬆️"``) and
    the best bonus submission (highest ``"Effect ⬆️"``) are selected and joined
    with an outer merge, so users with only one kind of submission are kept.
    The overall score is the sum of both scores, a missing one counting as 0.

    Private submissions (name ending in ``" (*)"``) are grouped under a
    separate ``"<username> (*)"`` entry.

    Args:
        tossup_df: Tossup leaderboard with at least the ``"Submission"`` and
            ``"E [Score] ⬆️"`` columns.
        bonus_df: Bonus leaderboard with at least the ``"Submission"``,
            ``"Effect ⬆️"``, ``"Part Acc"`` and ``"Adoption"`` columns.

    Returns:
        A new DataFrame sorted by ``"Overall Score ⬆️"`` in descending order
        with a fresh 0-based index. The inputs are not modified.
    """

    # Helper to extract username from 'Submission' (format: username/model_name)
    def extract_username(submission: str) -> str:
        username = submission.partition("/")[0]
        if submission.endswith(" (*)"):
            # Keep private entries apart from the same user's public ones.
            username = username + " (*)"
        return username

    # Helper to extract model_name from a 'Submission' column. Only the first
    # "/" separates user and model, so model names containing "/" stay intact.
    def extract_model_name(submissions: pd.Series) -> pd.Series:
        return submissions.str.split("/", n=1).str[1]

    # Add username columns on copies so the callers' frames stay untouched
    tossup_df = tossup_df.copy()
    tossup_df["Username"] = tossup_df["Submission"].apply(extract_username)
    bonus_df = bonus_df.copy()
    bonus_df["Username"] = bonus_df["Submission"].apply(extract_username)

    # Pick best tossup per user (highest E [Score] ⬆️)
    tossup_best = tossup_df.sort_values("E [Score] ⬆️", ascending=False).drop_duplicates("Username")
    tossup_best = tossup_best.set_index("Username")

    # Pick best bonus per user (highest Effect ⬆️)
    bonus_best = bonus_df.sort_values("Effect ⬆️", ascending=False).drop_duplicates("Username")
    bonus_best = bonus_best.set_index("Username")

    # Merge on Username (outer join to include users who have only one type).
    # Columns present in both frames (e.g. "Submission") receive the suffixes.
    merged = pd.merge(
        tossup_best,
        bonus_best,
        left_index=True,
        right_index=True,
        how="outer",
        suffixes=("_tossup", "_bonus"),
    )

    # Compose a summary row per user
    leaderboard = pd.DataFrame(
        {
            "Username": merged.index,
            "Tossup Submission": extract_model_name(merged["Submission_tossup"]),
            "Bonus Submission": extract_model_name(merged["Submission_bonus"]),
            "Overall Score ⬆️": merged[["E [Score] ⬆️", "Effect ⬆️"]].fillna(0).sum(axis=1),
            "Tossup Score ⬆️": merged["E [Score] ⬆️"],
            "Bonus Effect ⬆️": merged["Effect ⬆️"],
            "Bonus Part Acc": merged["Part Acc"],
            "Bonus Adoption": merged["Adoption"],
        }
    )

    leaderboard = leaderboard.sort_values("Overall Score ⬆️", ascending=False)

    return leaderboard.reset_index(drop=True)


def highlight_overall_row(row):
    """Return one CSS rule per cell, tinting the whole row for private entries.

    A row counts as private when its ``"Username"`` value ends with ``"(*)"``.
    """
    width = len(row)
    if not width:
        return []
    is_private = row["Username"].endswith("(*)")
    return ["background-color: lightblue" if is_private else ""] * width


def fetch_overall_leaderboard(
    split: str = "tiny_eval",
    style: bool = True,
    date: datetime.date | None = None,
    username: str | None = None,
):
    """Load the combined tossup + bonus leaderboard, optionally formatted for display.

    Args:
        split: Evaluation split to load.
        style: When true, return a pandas ``Styler``; otherwise return the
            plain DataFrame.
        date: Cutoff date forwarded to both underlying leaderboards.
        username: Logged-in viewer forwarded to both underlying leaderboards.

    Returns:
        The overall leaderboard DataFrame, or a ``Styler`` wrapping it when
        ``style`` is true. Missing values are rendered as ``"-"`` in the
        styled output.
    """
    # The raw (unstyled) frames are needed here regardless of ``style``.
    bonus_df = fetch_bonus_leaderboard(split, style=False, date=date, username=username)
    tossup_df = fetch_tossup_leaderboard(split, style=False, date=date, username=username)
    overall_df = create_overall_leaderboard(tossup_df, bonus_df)
    if not style:
        # Skip building a Styler that would be thrown away immediately.
        return overall_df

    return (
        overall_df.style.format(
            {
                "Overall Score ⬆️": "{:6.3f}",
                "Tossup Score ⬆️": "{:6.3f}",
                "Bonus Effect ⬆️": "{:6.3f}",
                "Bonus Part Acc": "{:>6.1%}",
                "Bonus Adoption": "{:>6.1%}",
            },
            na_rep="-",
        )
        .map(colour_pos_neg, subset=["Overall Score ⬆️"])
        .apply(highlight_overall_row, axis=1)
        .apply(
            bold_max,
            subset=["Overall Score ⬆️", "Tossup Score ⬆️", "Bonus Effect ⬆️"],
            axis=0,
        )
    )