Maharshi Gor committed · Commit b2cdb46 · 1 Parent(s): 159a0ce

Update with new metrics

Files changed:
- app.py (+14, -5)
- metrics_manual.md (+15, -10)
- src/populate.py (+97, -33)
app.py CHANGED

@@ -25,6 +25,7 @@ from src.envs import (
 from src.hf_dataset_utils import download_dataset_snapshot
 from src.populate import (
     fetch_bonus_leaderboard,
+    fetch_overall_leaderboard,
     fetch_tossup_leaderboard,
 )
 
@@ -61,21 +62,22 @@ def refresh_leaderboard(split: str = "tiny_eval", style: bool = True):
     download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
     tossup_df = fetch_tossup_leaderboard(split, style)
     bonus_df = fetch_bonus_leaderboard(split, style)
-    return tossup_df, bonus_df
+    overall_df = fetch_overall_leaderboard(split, style)
+    return tossup_df, bonus_df, overall_df
 
 
 def create_leaderboard_interface(app, split: str = "tiny_eval"):
     leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
     refresh_btn = gr.Button("🔄 Refresh")
 
-    tossup_df, bonus_df = refresh_leaderboard(split, style=False)
+    tossup_df, bonus_df, overall_df = refresh_leaderboard(split, style=False)
 
     gr.Markdown("## 🛎️ Tossup Round Leaderboard")
     logger.info(f"Tossup dataframe columns: {tossup_df.columns}")
     tossup_leaderboard = Leaderboard(
         value=tossup_df,
         search_columns=["Submission"],
-        datatype=["str", "number", "number", "number", "number"
+        datatype=["str", "number", "number", "number", "number"],
         elem_id="tossup-table",
         interactive=False,  # Ensure it's not interactive
     )
@@ -87,16 +89,23 @@ def create_leaderboard_interface(app, split: str = "tiny_eval"):
     bonus_leaderboard = Leaderboard(
         value=bonus_df,
         search_columns=["Submission"],
-        datatype=["str", "number", "number"],
+        datatype=["str", "number", "number", "number", "number", "number", "number"],
         elem_id="bonus-table",
         interactive=False,  # Ensure it's not interactive
     )
 
+    gr.Markdown("## 🥇 Overall Leaderboard")
+    overall_leaderboard = Leaderboard(
+        value=overall_df,
+        search_columns=["Username", "Tossup Submission", "Bonus Submission"],
+        datatype=["str", "str", "str", "number", "number", "number", "number", "number"],
+    )
+
     gr.on(
         triggers=[leaderboard_timer.tick, refresh_btn.click, app.load],
         fn=refresh_leaderboard,
         inputs=[gr.State(split)],
-        outputs=[tossup_leaderboard, bonus_leaderboard],
+        outputs=[tossup_leaderboard, bonus_leaderboard, overall_leaderboard],
     )
 
 
metrics_manual.md CHANGED

@@ -4,31 +4,36 @@ This document explains the metrics displayed on the QANTA 2025 Human-AI Cooperat
 
 ## Tossup Round Metrics
 
-Tossup rounds measure an AI system's ability to answer questions as they're being read:
+Tossup rounds measure an AI system's ability to answer questions as they're being read, in direct competition with human buzz points:
 
 | Metric | Description |
 |--------|-------------|
 | **Submission** | The username and model name of the submission (format: `username/model_name`) |
-| **
-| **Buzz
+| **Expected Score ⬆️** | Average points scored per tossup question, using the point scale **+1 for a correct answer, -0.5 for an incorrect buzz, and 0 for no buzz**. Scores are computed by simulating real competition against human buzz-point data: the model scores only if it buzzes before the human, and is penalized if it buzzes incorrectly before the human. |
+| **Buzz Precision** | Percentage of correct answers among the questions where the model decides to buzz in. Displayed as a percentage (e.g., 65.0%). |
+| **Buzz Frequency** | Percentage of questions on which the model buzzes in. Displayed as a percentage (e.g., 65.0%). |
 | **Buzz Position** | Average (token) position in the question when the model decides to answer. Lower values indicate earlier buzzing. |
-| **Win Rate w/ Humans** | Percentage of times the model successfully answers questions when competing with human players. |
+| **Win Rate w/ Humans** | Percentage of questions the model answers correctly before its human opponent buzzes correctly. |
 
 ## Bonus Round Metrics
 
-Bonus rounds test an AI system's ability to answer multi-part questions:
+Bonus rounds test an AI system's ability to answer multi-part questions and to explain its answers well enough to collaborate with another player. The leaderboard measures the model's effect on a simulated Quizbowl player (here, `gpt-4o-mini`):
 
 | Metric | Description |
 |--------|-------------|
 | **Submission** | The username and model name of the submission (format: `username/model_name`) |
-| **
-| **
+| **Effect** | The overall effect of the model's responses on the target Quizbowl player's accuracy. Specifically, this is the difference between the net accuracy of the gpt-4o-mini + model team and that of the gpt-4o-mini player alone, measured on the bonus set. In the team setting, the submitted model provides a guess, a confidence and an explanation, and the gpt-4o-mini player uses them to decide its final guess. |
+| **Question Acc** | Percentage of bonus questions where all parts were answered correctly. |
+| **Part Acc** | Percentage of individual bonus question parts answered correctly across all questions. |
+| **Calibration** | How well the model's confidence tracks its correctness. Specifically, the average absolute difference between the confidence score (between 0 and 1) and the binary correctness score (1 for correct, 0 for incorrect), over the bonus set. |
+| **Adoption** | The percentage of times the target player (gpt-4o-mini) adopts the model's guess, confidence and explanation for its final answer instead of using its own. |
 
 ## Understanding the Competition
 
 QANTA (Question Answering is Not a Trivial Activity) is a competition for building AI systems that can answer quiz bowl questions. Quiz bowl is a trivia competition format with:
 
-1. **Tossup questions**: Paragraph-length clues read in sequence where players can buzz in at any point to answer
-2. **Bonus questions**: Multi-part questions that test depth of knowledge in related areas
+1. **Tossup questions**: Paragraph-length clues read in sequence, where players can buzz in at any point to answer. The leaderboard simulates real competition by scoring against human buzz-point data.
+2. **Bonus questions**: Multi-part questions that test depth of knowledge in related areas. The leaderboard measures each model's effect in a team setting with a simulated human player (gpt-4o-mini).
 
-The leaderboard tracks how well AI models perform on both question types across different evaluation datasets.
+The leaderboard tracks how well AI models perform on both question types across different evaluation datasets, using these updated, competition-realistic metrics.
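The Expected Score rule above (+1 for a correct answer, -0.5 for an incorrect buzz, 0 for no buzz, simulated against a human buzz point) can be illustrated with a minimal sketch. The field names and numbers below are hypothetical and do not reflect the repository's actual result schema or scorer:

from statistics import mean
from typing import Optional


def tossup_score(model_buzz_pos: Optional[int], model_correct: bool, human_buzz_pos: int) -> float:
    """Score one tossup when the model races a human buzz point (illustrative sketch)."""
    if model_buzz_pos is None or model_buzz_pos >= human_buzz_pos:
        # No buzz, or the human buzzed first: the model neither scores nor is penalized.
        return 0.0
    # The model buzzed before the human: reward a correct answer, penalize an incorrect buzz.
    return 1.0 if model_correct else -0.5


questions = [  # invented per-question records
    {"model_buzz_pos": 35, "model_correct": True, "human_buzz_pos": 60},    # +1.0
    {"model_buzz_pos": 20, "model_correct": False, "human_buzz_pos": 45},   # -0.5
    {"model_buzz_pos": None, "model_correct": False, "human_buzz_pos": 50},  # 0.0
]
expected_score = mean(tossup_score(**q) for q in questions)  # (1.0 - 0.5 + 0.0) / 3 ≈ 0.17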
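The Effect metric above is simply the accuracy of the gpt-4o-mini + model team minus the accuracy of gpt-4o-mini alone on the same bonus set. A hedged sketch with invented per-part correctness values:

team_correct = [1, 1, 0, 1, 1, 0, 1, 1]  # invented: parts the team answered correctly
solo_correct = [1, 0, 0, 1, 1, 0, 0, 1]  # invented: parts gpt-4o-mini alone answered correctly

team_acc = sum(team_correct) / len(team_correct)  # 0.750
solo_acc = sum(solo_correct) / len(solo_correct)  # 0.625
effect = team_acc - solo_acc  # +0.125: the model lifts the player's accuracy by 12.5 points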
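Similarly, the Calibration figure above is the mean absolute gap between reported confidence and binary correctness, so lower values indicate better-calibrated confidence under this definition. A sketch with invented values:

confidences = [0.9, 0.4, 0.8, 0.6]  # invented model-reported confidences in [0, 1]
correct = [1, 0, 0, 1]              # 1 if the corresponding answer was correct, else 0

calibration = sum(abs(c, ) if False else abs(c - y) for c, y in zip(confidences, correct)) / len(confidences) if False else \
    sum(abs(c - y) for c, y in zip(confidences, correct)) / len(confidences)
# (0.1 + 0.4 + 0.8 + 0.4) / 4 = 0.425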
src/populate.py CHANGED

@@ -39,20 +39,15 @@ def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
             metrics = result["metrics"]
             username = result["username"]
             model_name = result["model_name"]
-            buzz_accuracy = metrics["buzz_accuracy"]
 
             row = {
                 "Submission": f"{username}/{model_name}",
-                "
-                "Buzz
+                "Expected Score ⬆️": metrics["expected_score"],
+                "Buzz Precision": metrics["buzz_accuracy"],
+                "Buzz Frequency": metrics["buzz_frequency"],
                 "Buzz Position": metrics["buzz_position"],
+                "Win Rate w/ Humans": metrics.get("human_win_rate", None),
             }
-            if "human_win_rate" in metrics:
-                row["Win Rate w/ Humans"] = metrics["human_win_rate"]
-                # row["Win Rate w/ Humans (Aggressive)"] = metrics["human_win_rate_strict"]
-            else:
-                row["Win Rate w/ Humans"] = None
-                # row["Win Rate w/ Humans (Aggressive)"] = None
             eval_results.append(row)
         except Exception as e:
             logger.error(f"Error processing model result '{username}/{model_name}': {e}")
@@ -62,14 +57,14 @@ def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
         eval_results,
         columns=[
             "Submission",
-            "
-            "Buzz
+            "Expected Score ⬆️",
+            "Buzz Precision",
+            "Buzz Frequency",
             "Buzz Position",
             "Win Rate w/ Humans",
-            # "Win Rate w/ Humans (Aggressive)",
         ],
     )
-    df.sort_values(by="
+    df.sort_values(by="Expected Score ⬆️", ascending=False, inplace=True)
     return df
 
 
@@ -85,8 +80,11 @@ def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
 
             row = {
                 "Submission": f"{username}/{model_name}",
-                "
-                "Part
+                "Effect ⬆️": metrics["effectiveness"],
+                "Part Acc": metrics["part_accuracy"],
+                "Question Acc": metrics["question_accuracy"],
+                "Calibration": metrics["calibration"],
+                "Adoption": metrics["adoption"],
             }
             eval_results.append(row)
         except Exception as e:
@@ -95,31 +93,32 @@ def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
 
     df = pd.DataFrame(
         eval_results,
-        columns=["Submission", "
+        columns=["Submission", "Effect ⬆️", "Part Acc", "Question Acc", "Calibration", "Adoption"],
     )
-    df.sort_values(by="
+    df.sort_values(by="Effect ⬆️", ascending=False, inplace=True)
     return df
 
 
+def colour_pos_neg(v):
+    """Return a CSS rule for the cell that called the function."""
+    if pd.isna(v):  # keep NaNs unstyled
+        return ""
+    return "color: green;" if v > 0 else "color: red;"
+
+
 def fetch_tossup_leaderboard(split: str = "tiny_eval", style: bool = True):
     df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split)
 
-    def colour_pos_neg(v):
-        """Return a CSS rule for the cell that called the function."""
-        if pd.isna(v):  # keep NaNs unstyled
-            return ""
-        return "color: green;" if v > 0 else "color: red;"
-
     # Apply formatting and styling
     styled_df = df.style.format(
         {
-            "
-            "Buzz
+            "Expected Score ⬆️": "{:5.2f}",
+            "Buzz Precision": "{:>6.1%}",
             "Buzz Position": "{:>6.1f}",
+            "Buzz Frequency": "{:>6.1f}",
             "Win Rate w/ Humans": "{:>6.1%}",
-            # "Win Rate w/ Humans (Aggressive)": "{:>6.1%}",
         }
-    ).map(colour_pos_neg, subset=["
+    ).map(colour_pos_neg, subset=["Expected Score ⬆️"])
 
     return styled_df if style else df
 
@@ -130,17 +129,82 @@ def fetch_bonus_leaderboard(split: str = "tiny_eval", style: bool = True):
     # Apply formatting and styling
     styled_df = df.style.format(
         {
-            "Question
-            "Part
+            "Question Acc": "{:>6.1%}",
+            "Part Acc": "{:>6.1%}",
+            "Effect ⬆️": "{:5.2f}",
+            "Calibration": "{:>6.1%}",
+            "Adoption": "{:>6.1%}",
         }
-    )
+    ).map(colour_pos_neg, subset=["Effect ⬆️"])
 
     return styled_df if style else df
 
 
 # TODO: Implement this once we have the proxy server running.
 def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame) -> pd.DataFrame:
-    #
-
+    # Helper to extract username from 'Submission' (format: username/model_name)
+    def extract_username(submission: str) -> str:
+        return submission.split("/", 1)[0] if "/" in submission else submission
+
+    # Add username columns
+    tossup_df = tossup_df.copy()
+    tossup_df["Username"] = tossup_df["Submission"].apply(extract_username)
+    bonus_df = bonus_df.copy()
+    bonus_df["Username"] = bonus_df["Submission"].apply(extract_username)
+
+    # Pick best tossup per user (highest Expected Score ⬆️)
+    tossup_best = tossup_df.sort_values("Expected Score ⬆️", ascending=False).drop_duplicates("Username")
+    tossup_best = tossup_best.set_index("Username")
+
+    # Pick best bonus per user (highest Effect ⬆️)
+    bonus_best = bonus_df.sort_values("Effect ⬆️", ascending=False).drop_duplicates("Username")
+    bonus_best = bonus_best.set_index("Username")
+
+    # Merge on Username (outer join to include users who have only one type)
+    merged = pd.merge(
+        tossup_best,
+        bonus_best,
+        left_index=True,
+        right_index=True,
+        how="outer",
+        suffixes=("_tossup", "_bonus"),
+    )
 
-    #
+    # Compose a summary row per user
+    # Columns: Username, Tossup Submission, Bonus Submission, all metrics from both
+    leaderboard = pd.DataFrame(
+        {
+            "Username": merged.index,
+            "Tossup Submission": merged["Submission_tossup"].str.split("/").str[1],
+            "Bonus Submission": merged["Submission_bonus"].str.split("/").str[1],
+            "Overall Score ⬆️": merged[["Expected Score ⬆️", "Effect ⬆️"]].fillna(0).sum(axis=1),
+            "Expected Score (Tossup) ⬆️": merged["Expected Score ⬆️"],
+            "Effect (Bonus) ⬆️": merged["Effect ⬆️"],
+            "Part Acc (Bonus)": merged["Part Acc"],
+            "Adoption (Bonus)": merged["Adoption"],
+        }
+    )
+
+    leaderboard = leaderboard.sort_values("Overall Score ⬆️", ascending=False)
+
+    return leaderboard.reset_index(drop=True)
+
+
+def fetch_overall_leaderboard(split: str = "tiny_eval", style: bool = True):
+    bonus_df = fetch_bonus_leaderboard(split, style=False)
+    tossup_df = fetch_tossup_leaderboard(split, style=False)
+    overall_df = create_overall_leaderboard(tossup_df, bonus_df)
+
+    # Apply formatting and styling
+    styled_df = overall_df.style.format(
+        {
+            "Overall Score ⬆️": "{:5.2f}",
+            "Expected Score (Tossup) ⬆️": "{:5.2f}",
+            "Effect (Bonus) ⬆️": "{:5.2f}",
+            "Part Acc (Bonus)": "{:>6.1%}",
+            "Adoption (Bonus)": "{:>6.1%}",
        },
+        na_rep="-",
+    ).map(colour_pos_neg, subset=["Overall Score ⬆️"])
+
+    return styled_df if style else overall_df
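For reference, a small usage sketch of the new create_overall_leaderboard above, with toy frames that carry only the columns it reads (real leaderboard frames include more columns; the submission names and numbers are invented, and the import assumes running from the Space's repo root):

import pandas as pd

from src.populate import create_overall_leaderboard

tossup_df = pd.DataFrame(
    {
        "Submission": ["alice/fast-buzzer", "alice/slow-buzzer", "bob/baseline"],
        "Expected Score ⬆️": [0.42, 0.10, 0.25],
    }
)
bonus_df = pd.DataFrame(
    {
        "Submission": ["alice/helper-v2", "bob/helper"],
        "Effect ⬆️": [0.08, 0.03],
        "Part Acc": [0.71, 0.64],
        "Adoption": [0.55, 0.40],
    }
)

overall = create_overall_leaderboard(tossup_df, bonus_df)
# One row per username, keeping each user's best tossup and best bonus submission;
# "Overall Score ⬆️" = Expected Score + Effect (a missing side counts as 0), sorted descending.
print(overall[["Username", "Tossup Submission", "Bonus Submission", "Overall Score ⬆️"]])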