# This file is kept for reference only and is not used in the enhanced implementation.
# The actual implementation is in enhanced_leaderboard.py.

import datetime
import json
import os
from zoneinfo import ZoneInfo

import pandas as pd
from loguru import logger

from src.envs import ADMIN_USERS, EVAL_RESULTS_PATH


def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -> list[dict]:
    """Collect all JSON result files for a competition type and eval split."""
    model_results = []
    dirpath = os.path.join(repo_dir, competition_type, eval_split)
    for root, _, files in os.walk(dirpath):
        # Only read leaf directories that contain nothing but JSON result files.
        if len(files) == 0 or not all(f.endswith(".json") for f in files):
            continue
        for file in files:
            # The file name doubles as the submission id; it must carry the competition prefix.
            if not file.startswith(f"{competition_type}__"):
                continue
            filepath = os.path.join(root, file)
            try:
                with open(filepath, "r") as fp:
                    result = json.load(fp)
                model_results.append(result)
            except Exception as e:
                logger.error(f"Error loading model result from {filepath}: {e}")
                continue
    return model_results


def fetch_tossup_elo_results(repo_dir: str, eval_split: str) -> dict[str, float]:
    """Load Elo scores keyed by submission id (consumed via .get(submission_id, ...) below)."""
    dirpath = os.path.join(repo_dir, "tossup", eval_split)
    filepath = os.path.join(dirpath, "elo_results.json")
    with open(filepath, "r") as fp:
        elo_results = json.load(fp)
    return elo_results


def get_submission_date(result: dict) -> datetime.date:
    submission_id = result["id"]
    datetime_str = submission_id.split("__")[-3]
    # The timestamp is formatted as YYYYMMDD_HHMMSS in UTC. Mark it as UTC explicitly
    # before converting: .astimezone() on a naive datetime would assume local time.
    datetime_obj = datetime.datetime.strptime(datetime_str, "%Y%m%d_%H%M%S")
    datetime_obj = datetime_obj.replace(tzinfo=datetime.timezone.utc)
    # Report the date in US Eastern time (DST-aware, unlike a fixed UTC-5 offset).
    return datetime_obj.astimezone(ZoneInfo("America/New_York")).date()
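
# Illustrative only -- the exact id layout is an assumption here: for an id such as
# "tossup__user__model__20240101_120000__foo__bar", split("__")[-3] picks out the
# "20240101_120000" timestamp segment.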


def qualify_for_private_observation(username: str, logged_in_username: str | None) -> bool:
    if not logged_in_username:
        return False
    if logged_in_username in ADMIN_USERS:
        return True
    if logged_in_username == username:
        return True
    return False


def get_tossups_leaderboard_df(
    repo_dir: str, eval_split: str, cutoff_date: datetime.date | None = None, logged_in_username: str | None = None
) -> pd.DataFrame:
    model_results = fetch_model_results(repo_dir, "tossup", eval_split)
    elo_results = fetch_tossup_elo_results(repo_dir, eval_split)
    eval_results = []
    for result in model_results:
        try:
            submission_id = result["id"]
            metrics = result["metrics"]
            username = result["username"]
            model_name = result["model_name"]
            submission_name = f"{username}/{model_name}"
            # Submissions newer than the cutoff are private: hide them unless the viewer
            # is the owner or an admin, and mark the visible ones with a trailing (*).
            if cutoff_date and cutoff_date < get_submission_date(result):
                if not qualify_for_private_observation(username, logged_in_username):
                    continue
                submission_name = f"{username}/{model_name} (*)"
            e_score_ai = elo_results.get(submission_id, 0.0)
            # The overall expected score averages the human-opponent and AI-opponent scores.
            overall_expected_score = 0.5 * (metrics["expected_score"] + e_score_ai)
            row = {
                "Submission": submission_name,
                "E [Score] ⬆️": overall_expected_score,
                "E [Score] (🙋🏻)": metrics["expected_score"],
                "E [Score] (🤖)": e_score_ai,
                "Cost ⬇️": result["cost"],
                "Buz Prec.": metrics["buzz_accuracy"],
                "Buz Freq.": metrics["buzz_frequency"],
                "Buzz Position": metrics["buzz_position"],
                "Win Rate w/ 🙋🏻": metrics.get("human_win_rate", None),
            }
            eval_results.append(row)
        except Exception as e:
            # Use .get() here: the failure may have occurred before username/model_name were bound.
            logger.error(
                f"Error processing model result for eval_split={eval_split} "
                f"'{result.get('username')}/{result.get('model_name')}': {e}"
            )
            continue
    df = pd.DataFrame(eval_results)
    df.sort_values(by="E [Score] ⬆️", ascending=False, inplace=True)
    return df


def get_bonuses_leaderboard_df(
    repo_dir: str, eval_split: str, cutoff_date: datetime.date | None = None, logged_in_username: str | None = None
) -> pd.DataFrame:
    model_results = fetch_model_results(repo_dir, "bonus", eval_split)
    eval_results = []
    for result in model_results:
        try:
            metrics = result["metrics"]
            username = result["username"]
            model_name = result["model_name"]
            submission_name = f"{username}/{model_name}"
            # Same privacy rule as the tossup leaderboard.
            if cutoff_date and cutoff_date < get_submission_date(result):
                if not qualify_for_private_observation(username, logged_in_username):
                    continue
                submission_name = f"{username}/{model_name} (*)"
            row = {
                "Submission": submission_name,
                "Cost ⬇️": result["cost"],
                "Effect ⬆️": metrics["effectiveness"],
                "Part Acc": metrics["part_accuracy"],
                "Question Acc": metrics["question_accuracy"],
                "Calibration": metrics["calibration"],
                "Adoption": metrics["adoption"],
            }
            eval_results.append(row)
        except Exception as e:
            # Use .get() here: the failure may have occurred before username/model_name were bound.
            logger.exception(
                f"Error processing model result '{result.get('username')}/{result.get('model_name')}': {e}"
            )
            continue
    df = pd.DataFrame(eval_results)
    df.sort_values(by=["Effect ⬆️", "Question Acc", "Part Acc"], ascending=False, inplace=True)
    return df


def colour_pos_neg(v):
    """Return a CSS colour rule for a single cell: green if positive, red otherwise."""
    if pd.isna(v):  # keep NaNs unstyled
        return ""
    return "color: green;" if float(v) > 0 else "color: red;"


def color_cost(v):
    """Bucket the cost into five ranges, each rendered in a progressively darker colour."""
    if pd.isna(v):
        return ""
    cost = float(v)
    if cost < 1:
        return "color: #006400;"  # dark green
    elif cost < 2:
        return "color: #00008b;"  # dark blue
    elif cost < 3:
        return "color: #8b8b00;"  # dark yellow
    elif cost < 4:
        return "color: #8b4500;"  # dark orange
    else:
        return "color: #8b0000;"  # dark red


def bold_max(s):
    """Bold the highest value in a column."""
    is_max = s == s.max()
    return ["font-weight: bold" if v else "" for v in is_max]


def highlight_private_row(row):
    """Shade rows whose submission is private (marked with a trailing (*))."""
    return ["background-color: lightblue" if row["Submission"].endswith("(*)") else "" for _ in row]


def fetch_tossup_leaderboard(
    split: str = "tiny_eval", style: bool = True, date: datetime.date | None = None, username: str | None = None
):
    df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)

    # Apply formatting and styling.
    percent_cols = ["Buz Prec.", "Buz Freq.", "Win Rate w/ 🙋🏻"]
    float_cols = ["E [Score] ⬆️", "E [Score] (🙋🏻)", "E [Score] (🤖)", "Buzz Position"]
    styled_df = (
        df.style.format(
            {
                **dict.fromkeys(percent_cols, "{:>6.1%}"),
                **dict.fromkeys(float_cols, "{:6.3f}"),
                "Cost ⬇️": "${:,.2f}",
            }
        )
        .map(colour_pos_neg, subset=["E [Score] ⬆️", "E [Score] (🤖)", "E [Score] (🙋🏻)"])
        .map(color_cost, subset=["Cost ⬇️"])
        .apply(highlight_private_row, axis=1)
        .apply(
            bold_max,
            subset=[*percent_cols, *float_cols],
            axis=0,
        )
    )
    return styled_df if style else df


def fetch_bonus_leaderboard(
    split: str = "tiny_eval", style: bool = True, date: datetime.date | None = None, username: str | None = None
):
    df = get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)

    # Apply formatting and styling.
    styled_df = (
        df.style.format(
            {
                "Question Acc": "{:>6.1%}",
                "Part Acc": "{:>6.1%}",
                "Effect ⬆️": "{:6.3f}",
                "Calibration": "{:>6.1%}",
                "Adoption": "{:>6.1%}",
                "Cost ⬇️": "${:,.2f}",
            }
        )
        .map(colour_pos_neg, subset=["Effect ⬆️"])
        .map(color_cost, subset=["Cost ⬇️"])
        .apply(highlight_private_row, axis=1)
        .apply(
            bold_max,
            subset=["Effect ⬆️", "Question Acc", "Part Acc", "Calibration", "Adoption"],
            axis=0,
        )
    )
    return styled_df if style else df


# TODO: Implement this once we have the proxy server running.
def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame) -> pd.DataFrame:
    # Helper to extract the username from 'Submission' (format: username/model_name).
    def extract_username(submission: str) -> str:
        username = submission.split("/", 1)[0] if "/" in submission else submission
        if submission.endswith(" (*)"):
            username = username + " (*)"
        return username

    # Add username columns.
    tossup_df = tossup_df.copy()
    tossup_df["Username"] = tossup_df["Submission"].apply(extract_username)
    bonus_df = bonus_df.copy()
    bonus_df["Username"] = bonus_df["Submission"].apply(extract_username)

    # Pick the best tossup per user (highest E [Score] ⬆️).
    tossup_best = tossup_df.sort_values("E [Score] ⬆️", ascending=False).drop_duplicates("Username")
    tossup_best = tossup_best.set_index("Username")

    # Pick the best bonus per user (highest Effect ⬆️).
    bonus_best = bonus_df.sort_values("Effect ⬆️", ascending=False).drop_duplicates("Username")
    bonus_best = bonus_best.set_index("Username")

    # Merge on Username (outer join, so users with only one submission type are kept).
    merged = pd.merge(
        tossup_best,
        bonus_best,
        left_index=True,
        right_index=True,
        how="outer",
        suffixes=("_tossup", "_bonus"),
    )

    # Compose a summary row per user:
    # Username, Tossup Submission, Bonus Submission, plus the key metrics from both.
    leaderboard = pd.DataFrame(
        {
            "Username": merged.index,
            "Tossup Submission": merged["Submission_tossup"].str.split("/").str[1],
            "Bonus Submission": merged["Submission_bonus"].str.split("/").str[1],
            "Overall Score ⬆️": merged[["E [Score] ⬆️", "Effect ⬆️"]].fillna(0).sum(axis=1),
            "Tossup Score ⬆️": merged["E [Score] ⬆️"],
            "Bonus Effect ⬆️": merged["Effect ⬆️"],
            "Bonus Part Acc": merged["Part Acc"],
            "Bonus Adoption": merged["Adoption"],
        }
    )
    leaderboard = leaderboard.sort_values("Overall Score ⬆️", ascending=False)
    return leaderboard.reset_index(drop=True)


def highlight_overall_row(row):
    return ["background-color: lightblue" if row["Username"].endswith("(*)") else "" for _ in row]


def fetch_overall_leaderboard(
    split: str = "tiny_eval", style: bool = True, date: datetime.date | None = None, username: str | None = None
):
    bonus_df = fetch_bonus_leaderboard(split, style=False, date=date, username=username)
    tossup_df = fetch_tossup_leaderboard(split, style=False, date=date, username=username)
    overall_df = create_overall_leaderboard(tossup_df, bonus_df)

    # Apply formatting and styling.
    styled_df = (
        overall_df.style.format(
            {
                "Overall Score ⬆️": "{:6.3f}",
                "Tossup Score ⬆️": "{:6.3f}",
                "Bonus Effect ⬆️": "{:6.3f}",
                "Bonus Part Acc": "{:>6.1%}",
                "Bonus Adoption": "{:>6.1%}",
            },
            na_rep="-",
        )
        .map(colour_pos_neg, subset=["Overall Score ⬆️"])
        .apply(highlight_overall_row, axis=1)
        .apply(
            bold_max,
            subset=["Overall Score ⬆️", "Tossup Score ⬆️", "Bonus Effect ⬆️"],
            axis=0,
        )
    )
    return styled_df if style else overall_df
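

if __name__ == "__main__":
    # Minimal usage sketch. It assumes EVAL_RESULTS_PATH points at a checked-out
    # results repo that actually contains the "tiny_eval" split.
    raw_df = fetch_tossup_leaderboard(split="tiny_eval", style=False)
    print(raw_df.head())

    # With style=True the fetchers return a pandas Styler; render it to HTML for inspection.
    styled = fetch_overall_leaderboard(split="tiny_eval", style=True)
    print(styled.to_html()[:500])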