Maharshi Gor committed
Commit b2cdb46 · 1 Parent(s): 159a0ce

Update with new metrics

Files changed (3)
  1. app.py +14 -5
  2. metrics_manual.md +15 -10
  3. src/populate.py +97 -33
app.py CHANGED
@@ -25,6 +25,7 @@ from src.envs import (
 from src.hf_dataset_utils import download_dataset_snapshot
 from src.populate import (
     fetch_bonus_leaderboard,
+    fetch_overall_leaderboard,
     fetch_tossup_leaderboard,
 )
 
@@ -61,21 +62,22 @@ def refresh_leaderboard(split: str = "tiny_eval", style: bool = True):
     download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
     tossup_df = fetch_tossup_leaderboard(split, style)
     bonus_df = fetch_bonus_leaderboard(split, style)
-    return tossup_df, bonus_df
+    overall_df = fetch_overall_leaderboard(split, style)
+    return tossup_df, bonus_df, overall_df
 
 
 def create_leaderboard_interface(app, split: str = "tiny_eval"):
     leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
     refresh_btn = gr.Button("🔄 Refresh")
 
-    tossup_df, bonus_df = refresh_leaderboard(split, style=False)
+    tossup_df, bonus_df, overall_df = refresh_leaderboard(split, style=False)
 
     gr.Markdown("## 🛎️ Tossup Round Leaderboard")
     logger.info(f"Tossup dataframe columns: {tossup_df.columns}")
     tossup_leaderboard = Leaderboard(
         value=tossup_df,
         search_columns=["Submission"],
-        datatype=["str", "number", "number", "number", "number", "number"],
+        datatype=["str", "number", "number", "number", "number"],
         elem_id="tossup-table",
         interactive=False,  # Ensure it's not interactive
     )
@@ -87,16 +89,23 @@ def create_leaderboard_interface(app, split: str = "tiny_eval"):
     bonus_leaderboard = Leaderboard(
         value=bonus_df,
         search_columns=["Submission"],
-        datatype=["str", "number", "number"],
+        datatype=["str", "number", "number", "number", "number", "number", "number"],
         elem_id="bonus-table",
         interactive=False,  # Ensure it's not interactive
     )
 
+    gr.Markdown("## 🥇 Overall Leaderboard")
+    overall_leaderboard = Leaderboard(
+        value=overall_df,
+        search_columns=["Username", "Tossup Submission", "Bonus Submission"],
+        datatype=["str", "str", "str", "number", "number", "number", "number", "number"],
+    )
+
     gr.on(
         triggers=[leaderboard_timer.tick, refresh_btn.click, app.load],
         fn=refresh_leaderboard,
         inputs=[gr.State(split)],
-        outputs=[tossup_leaderboard, bonus_leaderboard],
+        outputs=[tossup_leaderboard, bonus_leaderboard, overall_leaderboard],
     )
 
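For context, here is a minimal, self-contained sketch (not from this repository) of the refresh pattern the change relies on: one callback returns three dataframes, and a timer, a button, and the page load all trigger it. Plain `gr.Dataframe` stands in for the `Leaderboard` component, and the 60-second interval is an arbitrary placeholder for `LEADERBOARD_REFRESH_INTERVAL`.

```python
import gradio as gr
import pandas as pd


def refresh():
    # Dummy data standing in for fetch_tossup_leaderboard / fetch_bonus_leaderboard /
    # fetch_overall_leaderboard; the real callback returns three dataframes the same way.
    tossup_df = pd.DataFrame({"Submission": ["user/model"], "Expected Score ⬆️": [0.31]})
    bonus_df = pd.DataFrame({"Submission": ["user/model"], "Effect ⬆️": [0.12]})
    overall_df = pd.DataFrame({"Username": ["user"], "Overall Score ⬆️": [0.43]})
    return tossup_df, bonus_df, overall_df


with gr.Blocks() as demo:
    timer = gr.Timer(60)  # placeholder refresh interval, in seconds
    refresh_btn = gr.Button("🔄 Refresh")
    tossup_table = gr.Dataframe()
    bonus_table = gr.Dataframe()
    overall_table = gr.Dataframe()

    # One event wiring for all three triggers; the callback's three return values
    # map positionally onto the three output components.
    gr.on(
        triggers=[timer.tick, refresh_btn.click, demo.load],
        fn=refresh,
        outputs=[tossup_table, bonus_table, overall_table],
    )

if __name__ == "__main__":
    demo.launch()
```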
metrics_manual.md CHANGED
@@ -4,31 +4,36 @@ This document explains the metrics displayed on the QANTA 2025 Human-AI Cooperat
 
 ## Tossup Round Metrics
 
-Tossup rounds measure an AI system's ability to answer questions as they're being read:
+Tossup rounds measure an AI system's ability to answer questions as they're being read, in direct competition with human buzz points:
 
 | Metric | Description |
 |--------|-------------|
 | **Submission** | The username and model name of the submission (format: `username/model_name`) |
-| **Avg Score ⬆️** | Average points scored per tossup question. 10 points is the maximum score per question. -5 point for incorrect buzzes, 0 for no buzz. Positive scores (green) indicate good performance, while negative scores (red) indicate penalties for incorrect answers. |
-| **Buzz Accuracy** | Percentage of correct answers when the model decides to buzz in. Displayed as a percentage (e.g., 65.0%). |
+| **Expected Score ⬆️** | Average points scored per tossup question, using the point scale **+1 for a correct answer, -0.5 for an incorrect buzz, 0 for no buzz**. Scores are computed by simulating real competition against human buzz-point data: the model only scores if it buzzes before the human, and is only penalized if it buzzes incorrectly before the human. |
+| **Buzz Precision** | Percentage of correct answers when the model decides to buzz in. Displayed as a percentage (e.g., 65.0%). |
+| **Buzz Frequency** | Percentage of questions on which the model buzzes in. Displayed as a percentage (e.g., 65.0%). |
 | **Buzz Position** | Average (token) position in the question when the model decides to answer. Lower values indicate earlier buzzing. |
-| **Win Rate w/ Humans** | Percentage of times the model successfully answers questions when competing with human players. |
+| **Win Rate w/ Humans** | Percentage of questions the model answers correctly before its human opponent buzzes correctly. |
 
 ## Bonus Round Metrics
 
-Bonus rounds test an AI system's ability to answer multi-part questions:
+Bonus rounds test an AI system's ability to answer multi-part questions and explain its answers well enough to collaborate with another player. The leaderboard measures the model's effect on a simulated quiz bowl player (here, `gpt-4o-mini`):
 
 | Metric | Description |
 |--------|-------------|
 | **Submission** | The username and model name of the submission (format: `username/model_name`) |
-| **Question Accuracy** | Percentage of bonus questions where all parts were answered correctly. |
-| **Part Accuracy** | Percentage of individual bonus question parts answered correctly across all questions. |
+| **Effect** | The overall effect of the model's responses on the target player's accuracy: the difference between the net accuracy of the `gpt-4o-mini` + model team and that of the `gpt-4o-mini` player alone, measured on the bonus set. In the team setting, the submitted model provides a guess, a confidence, and an explanation, and the `gpt-4o-mini` player uses them to produce the final answer. |
+| **Question Acc** | Percentage of bonus questions where all parts were answered correctly. |
+| **Part Acc** | Percentage of individual bonus question parts answered correctly across all questions. |
+| **Calibration** | How well the model's confidence matches its correctness: the average absolute difference between the confidence score (between 0 and 1) and the binary correctness score (1 for correct, 0 for incorrect), over the bonus set. |
+| **Adoption** | The percentage of times the target player adopts the submitted model's guess, confidence, and explanation for its final answer, as opposed to using its own. |
 
 ## Understanding the Competition
 
 QANTA (Question Answering is Not a Trivial Activity) is a competition for building AI systems that can answer quiz bowl questions. Quiz bowl is a trivia competition format with:
 
-1. **Tossup questions**: Paragraph-length clues read in sequence where players can buzz in at any point to answer
-2. **Bonus questions**: Multi-part questions that test depth of knowledge in related areas
+1. **Tossup questions**: Paragraph-length clues read in sequence where players can buzz in at any point to answer. The leaderboard simulates real competition by using human buzz-point data for scoring.
+2. **Bonus questions**: Multi-part questions that test depth of knowledge in related areas. The leaderboard measures the effect of models in a team setting with a simulated human (`gpt-4o-mini`).
 
-The leaderboard tracks how well AI models perform on both question types across different evaluation datasets.
+The leaderboard tracks how well AI models perform on both question types across different evaluation datasets, using these competition-realistic metrics.
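To make the new tossup scoring and calibration definitions concrete, here is a minimal illustrative sketch (not part of this commit). The record fields `model_buzz_pos`, `human_buzz_pos`, `model_correct`, `confidence`, and `correct` are hypothetical names chosen for the example, not the evaluation code's actual schema.

```python
from statistics import mean


def expected_tossup_score(records: list[dict]) -> float:
    """+1 for a correct buzz, -0.5 for an incorrect buzz, 0 for no buzz.

    The model only scores (or is penalized) when it buzzes before the human
    opponent's recorded buzz position, mirroring the simulated competition.
    """
    scores = []
    for r in records:
        model_pos = r["model_buzz_pos"]                    # None = never buzzed
        human_pos = r.get("human_buzz_pos", float("inf"))  # inf = human never buzzed
        if model_pos is None or model_pos >= human_pos:
            scores.append(0.0)   # no buzz, or the human buzzed first
        elif r["model_correct"]:
            scores.append(1.0)
        else:
            scores.append(-0.5)
    return mean(scores)


def calibration(records: list[dict]) -> float:
    """Average absolute difference between confidence (0-1) and correctness (0/1)."""
    return mean(abs(r["confidence"] - float(r["correct"])) for r in records)


tossups = [
    {"model_buzz_pos": 20, "human_buzz_pos": 35, "model_correct": True},     # +1
    {"model_buzz_pos": 15, "human_buzz_pos": 30, "model_correct": False},    # -0.5
    {"model_buzz_pos": None, "human_buzz_pos": 25, "model_correct": False},  # 0
]
print(expected_tossup_score(tossups))  # (1.0 - 0.5 + 0.0) / 3 ≈ 0.167

bonus_parts = [{"confidence": 0.9, "correct": True}, {"confidence": 0.7, "correct": False}]
print(calibration(bonus_parts))  # (0.1 + 0.7) / 2 ≈ 0.4
```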
src/populate.py CHANGED
@@ -39,20 +39,15 @@ def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
             metrics = result["metrics"]
             username = result["username"]
             model_name = result["model_name"]
-            buzz_accuracy = metrics["buzz_accuracy"]
 
             row = {
                 "Submission": f"{username}/{model_name}",
-                "Avg Score ⬆️": metrics["tossup_score"],
-                "Buzz Accuracy": buzz_accuracy,
+                "Expected Score ⬆️": metrics["expected_score"],
+                "Buzz Precision": metrics["buzz_accuracy"],
+                "Buzz Frequency": metrics["buzz_frequency"],
                 "Buzz Position": metrics["buzz_position"],
+                "Win Rate w/ Humans": metrics.get("human_win_rate", None),
             }
-            if "human_win_rate" in metrics:
-                row["Win Rate w/ Humans"] = metrics["human_win_rate"]
-                # row["Win Rate w/ Humans (Aggressive)"] = metrics["human_win_rate_strict"]
-            else:
-                row["Win Rate w/ Humans"] = None
-                # row["Win Rate w/ Humans (Aggressive)"] = None
             eval_results.append(row)
         except Exception as e:
             logger.error(f"Error processing model result '{username}/{model_name}': {e}")
@@ -62,14 +57,14 @@ def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
         eval_results,
         columns=[
             "Submission",
-            "Avg Score ⬆️",
-            "Buzz Accuracy",
+            "Expected Score ⬆️",
+            "Buzz Precision",
+            "Buzz Frequency",
             "Buzz Position",
             "Win Rate w/ Humans",
-            # "Win Rate w/ Humans (Aggressive)",
         ],
     )
-    df.sort_values(by="Avg Score ⬆️", ascending=False, inplace=True)
+    df.sort_values(by="Expected Score ⬆️", ascending=False, inplace=True)
     return df
 
 
@@ -85,8 +80,11 @@ def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
 
             row = {
                 "Submission": f"{username}/{model_name}",
-                "Question Accuracy": metrics["question_accuracy"],
-                "Part Accuracy": metrics["part_accuracy"],
+                "Effect ⬆️": metrics["effectiveness"],
+                "Part Acc": metrics["part_accuracy"],
+                "Question Acc": metrics["question_accuracy"],
+                "Calibration": metrics["calibration"],
+                "Adoption": metrics["adoption"],
             }
             eval_results.append(row)
         except Exception as e:
@@ -95,31 +93,32 @@ def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
 
     df = pd.DataFrame(
         eval_results,
-        columns=["Submission", "Question Accuracy", "Part Accuracy"],
+        columns=["Submission", "Effect ⬆️", "Part Acc", "Question Acc", "Calibration", "Adoption"],
    )
-    df.sort_values(by="Question Accuracy", ascending=False, inplace=True)
+    df.sort_values(by="Effect ⬆️", ascending=False, inplace=True)
     return df
 
 
+def colour_pos_neg(v):
+    """Return a CSS rule for the cell that called the function."""
+    if pd.isna(v):  # keep NaNs unstyled
+        return ""
+    return "color: green;" if v > 0 else "color: red;"
+
+
 def fetch_tossup_leaderboard(split: str = "tiny_eval", style: bool = True):
     df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split)
 
-    def colour_pos_neg(v):
-        """Return a CSS rule for the cell that called the function."""
-        if pd.isna(v):  # keep NaNs unstyled
-            return ""
-        return "color: green;" if v > 0 else "color: red;"
-
     # Apply formatting and styling
     styled_df = df.style.format(
         {
-            "Avg Score ⬆️": "{:5.2f}",
-            "Buzz Accuracy": "{:>6.1%}",
+            "Expected Score ⬆️": "{:5.2f}",
+            "Buzz Precision": "{:>6.1%}",
             "Buzz Position": "{:>6.1f}",
+            "Buzz Frequency": "{:>6.1f}",
             "Win Rate w/ Humans": "{:>6.1%}",
-            # "Win Rate w/ Humans (Aggressive)": "{:>6.1%}",
         }
-    ).map(colour_pos_neg, subset=["Avg Score ⬆️"])
+    ).map(colour_pos_neg, subset=["Expected Score ⬆️"])
 
     return styled_df if style else df
 
@@ -130,17 +129,82 @@ def fetch_bonus_leaderboard(split: str = "tiny_eval", style: bool = True):
     # Apply formatting and styling
     styled_df = df.style.format(
         {
-            "Question Accuracy": "{:>6.1%}",
-            "Part Accuracy": "{:>6.1%}",
+            "Question Acc": "{:>6.1%}",
+            "Part Acc": "{:>6.1%}",
+            "Effect ⬆️": "{:5.2f}",
+            "Calibration": "{:>6.1%}",
+            "Adoption": "{:>6.1%}",
         }
-    )
+    ).map(colour_pos_neg, subset=["Effect ⬆️"])
 
     return styled_df if style else df
 
 
 # TODO: Implement this once we have the proxy server running.
 def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame) -> pd.DataFrame:
-    # Merge the two dataframes on the 'Submission' column
-    merged_df = pd.merge(tossup_df, bonus_df, on="Submission", how="outer")
+    # Helper to extract username from 'Submission' (format: username/model_name)
+    def extract_username(submission: str) -> str:
+        return submission.split("/", 1)[0] if "/" in submission else submission
+
+    # Add username columns
+    tossup_df = tossup_df.copy()
+    tossup_df["Username"] = tossup_df["Submission"].apply(extract_username)
+    bonus_df = bonus_df.copy()
+    bonus_df["Username"] = bonus_df["Submission"].apply(extract_username)
+
+    # Pick best tossup per user (highest Expected Score ⬆️)
+    tossup_best = tossup_df.sort_values("Expected Score ⬆️", ascending=False).drop_duplicates("Username")
+    tossup_best = tossup_best.set_index("Username")
+
+    # Pick best bonus per user (highest Effect ⬆️)
+    bonus_best = bonus_df.sort_values("Effect ⬆️", ascending=False).drop_duplicates("Username")
+    bonus_best = bonus_best.set_index("Username")
+
+    # Merge on Username (outer join to include users who have only one type)
+    merged = pd.merge(
+        tossup_best,
+        bonus_best,
+        left_index=True,
+        right_index=True,
+        how="outer",
+        suffixes=("_tossup", "_bonus"),
+    )
 
-    # Calculate the overall score as a weighted average
+    # Compose a summary row per user
+    # Columns: Username, Tossup Submission, Bonus Submission, all metrics from both
+    leaderboard = pd.DataFrame(
+        {
+            "Username": merged.index,
+            "Tossup Submission": merged["Submission_tossup"].str.split("/").str[1],
+            "Bonus Submission": merged["Submission_bonus"].str.split("/").str[1],
+            "Overall Score ⬆️": merged[["Expected Score ⬆️", "Effect ⬆️"]].fillna(0).sum(axis=1),
+            "Expected Score (Tossup) ⬆️": merged["Expected Score ⬆️"],
+            "Effect (Bonus) ⬆️": merged["Effect ⬆️"],
+            "Part Acc (Bonus)": merged["Part Acc"],
+            "Adoption (Bonus)": merged["Adoption"],
+        }
+    )
+
+    leaderboard = leaderboard.sort_values("Overall Score ⬆️", ascending=False)
+
+    return leaderboard.reset_index(drop=True)
+
+
+def fetch_overall_leaderboard(split: str = "tiny_eval", style: bool = True):
+    bonus_df = fetch_bonus_leaderboard(split, style=False)
+    tossup_df = fetch_tossup_leaderboard(split, style=False)
+    overall_df = create_overall_leaderboard(tossup_df, bonus_df)
+
+    # Apply formatting and styling
+    styled_df = overall_df.style.format(
+        {
+            "Overall Score ⬆️": "{:5.2f}",
+            "Expected Score (Tossup) ⬆️": "{:5.2f}",
+            "Effect (Bonus) ⬆️": "{:5.2f}",
+            "Part Acc (Bonus)": "{:>6.1%}",
+            "Adoption (Bonus)": "{:>6.1%}",
+        },
+        na_rep="-",
+    ).map(colour_pos_neg, subset=["Overall Score ⬆️"])
+
+    return styled_df if style else overall_df
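
As a quick sanity check of the new merge logic, here is a hypothetical usage sketch (not part of the commit) that feeds toy tossup and bonus frames, with made-up usernames and scores, through `create_overall_leaderboard`. It assumes the repository's `src` package imports cleanly from the project root.

```python
import pandas as pd

from src.populate import create_overall_leaderboard

# Toy frames shaped like the unstyled tossup/bonus leaderboards built above.
tossup_df = pd.DataFrame(
    {
        "Submission": ["alice/model-a", "alice/model-b", "bob/model-c"],
        "Expected Score ⬆️": [0.42, 0.31, 0.18],
        "Buzz Precision": [0.80, 0.70, 0.60],
        "Buzz Frequency": [0.90, 0.80, 0.70],
        "Buzz Position": [55.0, 60.0, 70.0],
        "Win Rate w/ Humans": [0.50, 0.40, 0.30],
    }
)
bonus_df = pd.DataFrame(
    {
        "Submission": ["alice/model-a", "carol/model-d"],
        "Effect ⬆️": [0.12, 0.05],
        "Part Acc": [0.70, 0.60],
        "Question Acc": [0.50, 0.40],
        "Calibration": [0.20, 0.25],
        "Adoption": [0.60, 0.50],
    }
)

overall = create_overall_leaderboard(tossup_df, bonus_df)
print(overall[["Username", "Tossup Submission", "Bonus Submission", "Overall Score ⬆️"]])
# Each user keeps only their best tossup and best bonus submission; users with a single
# submission type still appear, and the missing score is treated as 0 in the overall sum.
```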