# leaderboard/src/populate.py
# This file is kept for reference only and is not used in the enhanced implementation
# The actual implementation is in enhanced_leaderboard.py
import datetime
import json
import os
import pandas as pd
from loguru import logger
from src.envs import ADMIN_USERS, EVAL_RESULTS_PATH
def fetch_model_results(repo_dir: str, competition_type: str, eval_split: str) -> list[dict]:
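    """Collect submission result JSONs under `<repo_dir>/<competition_type>/<eval_split>/`.

    Directories containing any non-JSON file are skipped; within a directory, only files
    named `<competition_type>__*` are read, and unreadable files are logged and skipped.
    """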
model_results = []
dirpath = os.path.join(repo_dir, competition_type, eval_split)
for root, _, files in os.walk(dirpath):
if len(files) == 0 or not all(f.endswith(".json") for f in files):
continue
for file in files:
# Check if the file name is a valid submission id
if not file.startswith(f"{competition_type}__"):
continue
filepath = os.path.join(root, file)
try:
with open(filepath, "r") as fp:
result = json.load(fp)
model_results.append(result)
except Exception as e:
logger.error(f"Error loading model result from {filepath}: {e}")
continue
return model_results
def fetch_tossup_elo_results(repo_dir: str, eval_split: str) -> dict[str, float]:
    """Load the model-vs-model (Elo) expected-score results for tossups, keyed by submission id."""
    dirpath = os.path.join(repo_dir, "tossup", eval_split)
    filepath = os.path.join(dirpath, "elo_results.json")
    with open(filepath, "r") as fp:
        elo_results = json.load(fp)
    return elo_results
def get_submission_date(result: dict) -> datetime.date:
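    """Parse the UTC timestamp embedded in the submission id and return its US Eastern date."""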
submission_id = result["id"]
datetime_str = submission_id.split("__")[-3]
    # str format is YYYYMMDD_HHMMSS in UTC. Convert to US Eastern (fixed UTC-5 offset) date.
    datetime_obj = datetime.datetime.strptime(datetime_str, "%Y%m%d_%H%M%S").replace(tzinfo=datetime.timezone.utc)
    return datetime_obj.astimezone(datetime.timezone(datetime.timedelta(hours=-5))).date()
def qualify_for_private_observation(username: str, logged_in_username: str | None) -> bool:
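    """Post-cutoff (private) submissions are visible only to admins and to the submitting user."""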
if not logged_in_username:
return False
if logged_in_username in ADMIN_USERS:
return True
if logged_in_username == username:
return True
return False
def get_tossups_leaderboard_df(
    repo_dir: str, eval_split: str, cutoff_date: datetime.date | None = None, logged_in_username: str | None = None
) -> pd.DataFrame:
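    """Build the tossup leaderboard DataFrame for one evaluation split.

    The headline "E [Score] ⬆️" is the average of the expected score against humans and the
    Elo-derived expected score against other models. Submissions newer than `cutoff_date`
    are hidden unless the viewer qualifies, in which case they are shown marked with "(*)".
    """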
model_results = fetch_model_results(repo_dir, "tossup", eval_split)
elo_results = fetch_tossup_elo_results(repo_dir, eval_split)
eval_results = []
for result in model_results:
try:
submission_id = result["id"]
metrics = result["metrics"]
username = result["username"]
model_name = result["model_name"]
submission_name = f"{username}/{model_name}"
if cutoff_date and cutoff_date < get_submission_date(result):
if not qualify_for_private_observation(username, logged_in_username):
continue
submission_name = f"{username}/{model_name} (*)"
e_score_ai = elo_results.get(submission_id, 0.0)
overall_expected_score = 0.5 * (metrics["expected_score"] + e_score_ai)
row = {
"Submission": submission_name,
"E [Score] ⬆️": overall_expected_score,
"E [Score] (🙋🏻)": metrics["expected_score"],
"E [Score] (🤖)": e_score_ai,
"Cost ⬇️": result["cost"],
"Buz Prec.": metrics["buzz_accuracy"],
"Buz Freq.": metrics["buzz_frequency"],
"Buzz Position": metrics["buzz_position"],
"Win Rate w/ 🙋🏻": metrics.get("human_win_rate", None),
}
eval_results.append(row)
        except Exception as e:
            submission_name = f"{result.get('username', '?')}/{result.get('model_name', '?')}"
            logger.error(f"Error processing model result for eval_split={eval_split} '{submission_name}': {e}")
            continue
df = pd.DataFrame(eval_results)
df.sort_values(by="E [Score] ⬆️", ascending=False, inplace=True)
return df
def get_bonuses_leaderboard_df(
    repo_dir: str, eval_split: str, cutoff_date: datetime.date | None = None, logged_in_username: str | None = None
) -> pd.DataFrame:
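    """Build the bonus leaderboard DataFrame for one evaluation split.

    Rows are sorted by effectiveness, then question and part accuracy. Submissions newer
    than `cutoff_date` are hidden unless the viewer qualifies, in which case they are
    shown marked with "(*)".
    """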
model_results = fetch_model_results(repo_dir, "bonus", eval_split)
eval_results = []
for result in model_results:
try:
metrics = result["metrics"]
username = result["username"]
model_name = result["model_name"]
submission_name = f"{username}/{model_name}"
if cutoff_date and cutoff_date < get_submission_date(result):
if not qualify_for_private_observation(username, logged_in_username):
continue
submission_name = f"{username}/{model_name} (*)"
row = {
"Submission": submission_name,
"Cost ⬇️": result["cost"],
"Effect ⬆️": metrics["effectiveness"],
"Part Acc": metrics["part_accuracy"],
"Question Acc": metrics["question_accuracy"],
"Calibration": metrics["calibration"],
"Adoption": metrics["adoption"],
}
eval_results.append(row)
        except Exception as e:
            submission_name = f"{result.get('username', '?')}/{result.get('model_name', '?')}"
            logger.exception(f"Error processing model result '{submission_name}': {e}")
            continue
df = pd.DataFrame(eval_results)
df.sort_values(by=["Effect ⬆️", "Question Acc", "Part Acc"], ascending=False, inplace=True)
return df
def colour_pos_neg(v):
"""Return a CSS rule for the cell that called the function."""
if pd.isna(v): # keep NaNs unstyled
return ""
return "color: green;" if float(v) > 0 else "color: red;"
def color_cost(v):
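    """Bucket the dollar cost into $1-wide bands, colored from dark green (cheap) to dark red (expensive)."""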
if pd.isna(v):
return ""
# Bucket the cost into 5 categories with darker colors
cost = float(v)
if cost < 1:
return "color: #006400;" # dark green
elif cost < 2:
return "color: #00008b;" # dark blue
elif cost < 3:
return "color: #8b8b00;" # dark yellow
elif cost < 4:
return "color: #8b4500;" # dark orange
else:
return "color: #8b0000;" # dark red
# Helper function to bold the highest value in a column
def bold_max(s):
is_max = s == s.max()
return ["font-weight: bold" if v else "" for v in is_max]
def highlight_private_row(row):
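    """Shade rows of private submissions (marked with "(*)") in light blue."""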
return ["background-color: lightblue" if row["Submission"].endswith("(*)") else "" for _ in row]
def fetch_tossup_leaderboard(
    split: str = "tiny_eval", style: bool = True, date: datetime.date | None = None, username: str | None = None
):
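    """Fetch the tossup leaderboard, styled as a pandas Styler by default or as a plain DataFrame."""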
df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)
# Apply formatting and styling
percent_cols = ["Buz Prec.", "Buz Freq.", "Win Rate w/ 🙋🏻"]
float_cols = ["E [Score] ⬆️", "E [Score] (🙋🏻)", "E [Score] (🤖)", "Buzz Position"]
styled_df = (
df.style.format(
{
**dict.fromkeys(percent_cols, "{:>6.1%}"),
**dict.fromkeys(float_cols, "{:6.3f}"),
"Cost ⬇️": "${:,.2f}",
}
)
.map(colour_pos_neg, subset=["E [Score] ⬆️", "E [Score] (🤖)", "E [Score] (🙋🏻)"])
.map(color_cost, subset=["Cost ⬇️"])
.apply(highlight_private_row, axis=1)
.apply(
bold_max,
subset=[*percent_cols, *float_cols],
axis=0,
)
)
return styled_df if style else df
def fetch_bonus_leaderboard(
    split: str = "tiny_eval", style: bool = True, date: datetime.date | None = None, username: str | None = None
):
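    """Fetch the bonus leaderboard, styled as a pandas Styler by default or as a plain DataFrame."""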
df = get_bonuses_leaderboard_df(EVAL_RESULTS_PATH, split, date, username)
# Apply formatting and styling
styled_df = (
df.style.format(
{
"Question Acc": "{:>6.1%}",
"Part Acc": "{:>6.1%}",
"Effect ⬆️": "{:6.3f}",
"Calibration": "{:>6.1%}",
"Adoption": "{:>6.1%}",
"Cost ⬇️": "${:,.2f}",
}
)
.map(colour_pos_neg, subset=["Effect ⬆️"])
.map(color_cost, subset=["Cost ⬇️"])
.apply(highlight_private_row, axis=1)
.apply(
bold_max,
subset=["Effect ⬆️", "Question Acc", "Part Acc", "Calibration", "Adoption"],
axis=0,
)
)
return styled_df if style else df
# TODO: Implement this once we have the proxy server running.
def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame) -> pd.DataFrame:
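    """Combine each user's best tossup and best bonus submission into one overall leaderboard.

    The overall score is the best tossup expected score plus the best bonus effectiveness,
    with missing entries counted as 0.
    """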
# Helper to extract username from 'Submission' (format: username/model_name)
def extract_username(submission: str) -> str:
username = submission.split("/", 1)[0] if "/" in submission else submission
if submission.endswith(" (*)"):
username = username + " (*)"
return username
# Add username columns
tossup_df = tossup_df.copy()
tossup_df["Username"] = tossup_df["Submission"].apply(extract_username)
bonus_df = bonus_df.copy()
bonus_df["Username"] = bonus_df["Submission"].apply(extract_username)
# Pick best tossup per user (highest Expected Score ⬆️)
tossup_best = tossup_df.sort_values("E [Score] ⬆️", ascending=False).drop_duplicates("Username")
tossup_best = tossup_best.set_index("Username")
# Pick best bonus per user (highest Effect ⬆️)
bonus_best = bonus_df.sort_values("Effect ⬆️", ascending=False).drop_duplicates("Username")
bonus_best = bonus_best.set_index("Username")
# Merge on Username (outer join to include users who have only one type)
merged = pd.merge(
tossup_best,
bonus_best,
left_index=True,
right_index=True,
how="outer",
suffixes=("_tossup", "_bonus"),
)
# Compose a summary row per user
# Columns: Username, Tossup Submission, Bonus Submission, all metrics from both
leaderboard = pd.DataFrame(
{
"Username": merged.index,
"Tossup Submission": merged["Submission_tossup"].str.split("/").str[1],
"Bonus Submission": merged["Submission_bonus"].str.split("/").str[1],
"Overall Score ⬆️": merged[["E [Score] ⬆️", "Effect ⬆️"]].fillna(0).sum(axis=1),
"Tossup Score ⬆️": merged["E [Score] ⬆️"],
"Bonus Effect ⬆️": merged["Effect ⬆️"],
"Bonus Part Acc": merged["Part Acc"],
"Bonus Adoption": merged["Adoption"],
}
)
leaderboard = leaderboard.sort_values("Overall Score ⬆️", ascending=False)
return leaderboard.reset_index(drop=True)
def highlight_overall_row(row):
return ["background-color: lightblue" if row["Username"].endswith("(*)") else "" for _ in row]
def fetch_overall_leaderboard(
    split: str = "tiny_eval", style: bool = True, date: datetime.date | None = None, username: str | None = None
):
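    """Fetch the combined leaderboard, styled as a pandas Styler by default or as a plain DataFrame."""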
bonus_df = fetch_bonus_leaderboard(split, style=False, date=date, username=username)
tossup_df = fetch_tossup_leaderboard(split, style=False, date=date, username=username)
overall_df = create_overall_leaderboard(tossup_df, bonus_df)
# Apply formatting and styling
styled_df = (
overall_df.style.format(
{
"Overall Score ⬆️": "{:6.3f}",
"Tossup Score ⬆️": "{:6.3f}",
"Bonus Effect ⬆️": "{:6.3f}",
"Bonus Part Acc": "{:>6.1%}",
"Bonus Adoption": "{:>6.1%}",
},
na_rep="-",
)
.map(colour_pos_neg, subset=["Overall Score ⬆️"])
.apply(highlight_overall_row, axis=1)
.apply(
bold_max,
subset=["Overall Score ⬆️", "Tossup Score ⬆️", "Bonus Effect ⬆️"],
axis=0,
)
)
return styled_df if style else overall_df
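

# Illustrative usage sketch (an assumption, not part of the original module): exercising the fetch
# helpers locally, assuming the package is importable, EVAL_RESULTS_PATH is populated, and a
# "tiny_eval" split exists on disk.
if __name__ == "__main__":
    # Unstyled DataFrames are easier to inspect in a terminal; keep style=True for HTML/Gradio rendering.
    tossup_df = fetch_tossup_leaderboard(split="tiny_eval", style=False)
    bonus_df = fetch_bonus_leaderboard(split="tiny_eval", style=False)
    print(tossup_df.head())
    print(bonus_df.head())
    print(create_overall_leaderboard(tossup_df, bonus_df).head())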