Maharshi Gor committed
Commit b2cdb46 · 1 Parent(s): 159a0ce

Update with new metrics

Files changed (3)
  1. app.py +14 -5
  2. metrics_manual.md +15 -10
  3. src/populate.py +97 -33
app.py CHANGED
@@ -25,6 +25,7 @@ from src.envs import (
 from src.hf_dataset_utils import download_dataset_snapshot
 from src.populate import (
     fetch_bonus_leaderboard,
+    fetch_overall_leaderboard,
     fetch_tossup_leaderboard,
 )
 
@@ -61,21 +62,22 @@ def refresh_leaderboard(split: str = "tiny_eval", style: bool = True):
     download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
     tossup_df = fetch_tossup_leaderboard(split, style)
     bonus_df = fetch_bonus_leaderboard(split, style)
-    return tossup_df, bonus_df
+    overall_df = fetch_overall_leaderboard(split, style)
+    return tossup_df, bonus_df, overall_df
 
 
 def create_leaderboard_interface(app, split: str = "tiny_eval"):
     leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
     refresh_btn = gr.Button("🔄 Refresh")
 
-    tossup_df, bonus_df = refresh_leaderboard(split, style=False)
+    tossup_df, bonus_df, overall_df = refresh_leaderboard(split, style=False)
 
     gr.Markdown("## 🛎️ Tossup Round Leaderboard")
     logger.info(f"Tossup dataframe columns: {tossup_df.columns}")
     tossup_leaderboard = Leaderboard(
         value=tossup_df,
         search_columns=["Submission"],
-        datatype=["str", "number", "number", "number", "number", "number"],
+        datatype=["str", "number", "number", "number", "number"],
         elem_id="tossup-table",
         interactive=False,  # Ensure it's not interactive
     )
@@ -87,16 +89,23 @@ def create_leaderboard_interface(app, split: str = "tiny_eval"):
     bonus_leaderboard = Leaderboard(
         value=bonus_df,
         search_columns=["Submission"],
-        datatype=["str", "number", "number"],
+        datatype=["str", "number", "number", "number", "number", "number", "number"],
         elem_id="bonus-table",
         interactive=False,  # Ensure it's not interactive
     )
 
+    gr.Markdown("## 🥇 Overall Leaderboard")
+    overall_leaderboard = Leaderboard(
+        value=overall_df,
+        search_columns=["Username", "Tossup Submission", "Bonus Submission"],
+        datatype=["str", "str", "str", "number", "number", "number", "number", "number"],
+    )
+
     gr.on(
         triggers=[leaderboard_timer.tick, refresh_btn.click, app.load],
         fn=refresh_leaderboard,
         inputs=[gr.State(split)],
-        outputs=[tossup_leaderboard, bonus_leaderboard],
+        outputs=[tossup_leaderboard, bonus_leaderboard, overall_leaderboard],
     )
 
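For context, here is a minimal, self-contained sketch (not from this repository) of the refresh pattern the change relies on: one callback returns three dataframes, and a timer, a button, and the page load all trigger it. Plain `gr.Dataframe` stands in for the `Leaderboard` component, and the 60-second interval is an arbitrary placeholder for `LEADERBOARD_REFRESH_INTERVAL`.

```python
import gradio as gr
import pandas as pd


def refresh():
    # Dummy data standing in for fetch_tossup_leaderboard / fetch_bonus_leaderboard /
    # fetch_overall_leaderboard; the real callback returns three dataframes the same way.
    tossup_df = pd.DataFrame({"Submission": ["user/model"], "Expected Score ⬆️": [0.31]})
    bonus_df = pd.DataFrame({"Submission": ["user/model"], "Effect ⬆️": [0.12]})
    overall_df = pd.DataFrame({"Username": ["user"], "Overall Score ⬆️": [0.43]})
    return tossup_df, bonus_df, overall_df


with gr.Blocks() as demo:
    timer = gr.Timer(60)  # placeholder refresh interval, in seconds
    refresh_btn = gr.Button("🔄 Refresh")
    tossup_table = gr.Dataframe()
    bonus_table = gr.Dataframe()
    overall_table = gr.Dataframe()

    # One event wiring for all three triggers; the callback's three return values
    # map positionally onto the three output components.
    gr.on(
        triggers=[timer.tick, refresh_btn.click, demo.load],
        fn=refresh,
        outputs=[tossup_table, bonus_table, overall_table],
    )

if __name__ == "__main__":
    demo.launch()
```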
metrics_manual.md CHANGED
@@ -4,31 +4,36 @@ This document explains the metrics displayed on the QANTA 2025 Human-AI Cooperat
 
 ## Tossup Round Metrics
 
-Tossup rounds measure an AI system's ability to answer questions as they're being read:
+Tossup rounds measure an AI system's ability to answer questions as they're being read, in direct competition with human buzz points:
 
 | Metric | Description |
 |--------|-------------|
 | **Submission** | The username and model name of the submission (format: `username/model_name`) |
-| **Avg Score ⬆️** | Average points scored per tossup question. 10 points is the maximum score per question. -5 point for incorrect buzzes, 0 for no buzz. Positive scores (green) indicate good performance, while negative scores (red) indicate penalties for incorrect answers. |
-| **Buzz Accuracy** | Percentage of correct answers when the model decides to buzz in. Displayed as a percentage (e.g., 65.0%). |
+| **Expected Score ⬆️** | Average points scored per tossup question, using the point scale **+1 for a correct answer, -0.5 for an incorrect buzz, 0 for no buzz**. Scores are computed by simulating real competition against human buzz-point data: the model only scores if it buzzes before the human, and is only penalized if it buzzes incorrectly before the human. |
+| **Buzz Precision** | Percentage of correct answers when the model decides to buzz in. Displayed as a percentage (e.g., 65.0%). |
+| **Buzz Frequency** | Percentage of questions on which the model buzzes in. Displayed as a percentage (e.g., 65.0%). |
 | **Buzz Position** | Average (token) position in the question when the model decides to answer. Lower values indicate earlier buzzing. |
-| **Win Rate w/ Humans** | Percentage of times the model successfully answers questions when competing with human players. |
+| **Win Rate w/ Humans** | Percentage of questions the model answers correctly before its human opponent buzzes correctly. |
 
 ## Bonus Round Metrics
 
-Bonus rounds test an AI system's ability to answer multi-part questions:
+Bonus rounds test an AI system's ability to answer multi-part questions and explain its answers well enough to collaborate with another player. The leaderboard measures the model's effect on a simulated quiz bowl player (here, `gpt-4o-mini`):
 
 | Metric | Description |
 |--------|-------------|
 | **Submission** | The username and model name of the submission (format: `username/model_name`) |
-| **Question Accuracy** | Percentage of bonus questions where all parts were answered correctly. |
-| **Part Accuracy** | Percentage of individual bonus question parts answered correctly across all questions. |
+| **Effect** | The overall effect of the model's responses on the target player's accuracy: the difference between the net accuracy of the `gpt-4o-mini` + model team and that of the `gpt-4o-mini` player alone, measured on the bonus set. In the team setting, the submitted model provides a guess, a confidence, and an explanation, and the `gpt-4o-mini` player uses them to produce the final answer. |
+| **Question Acc** | Percentage of bonus questions where all parts were answered correctly. |
+| **Part Acc** | Percentage of individual bonus question parts answered correctly across all questions. |
+| **Calibration** | How well the model's confidence matches its correctness: the average absolute difference between the confidence score (between 0 and 1) and the binary correctness score (1 for correct, 0 for incorrect), over the bonus set. |
+| **Adoption** | The percentage of times the target player adopts the submitted model's guess, confidence, and explanation for its final answer, as opposed to using its own. |
 
 ## Understanding the Competition
 
 QANTA (Question Answering is Not a Trivial Activity) is a competition for building AI systems that can answer quiz bowl questions. Quiz bowl is a trivia competition format with:
 
-1. **Tossup questions**: Paragraph-length clues read in sequence where players can buzz in at any point to answer
-2. **Bonus questions**: Multi-part questions that test depth of knowledge in related areas
+1. **Tossup questions**: Paragraph-length clues read in sequence where players can buzz in at any point to answer. The leaderboard simulates real competition by using human buzz-point data for scoring.
+2. **Bonus questions**: Multi-part questions that test depth of knowledge in related areas. The leaderboard measures the effect of models in a team setting with a simulated human (`gpt-4o-mini`).
 
-The leaderboard tracks how well AI models perform on both question types across different evaluation datasets.
+The leaderboard tracks how well AI models perform on both question types across different evaluation datasets, using these competition-realistic metrics.
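To make the new tossup scoring and calibration definitions concrete, here is a minimal illustrative sketch (not part of this commit). The record fields `model_buzz_pos`, `human_buzz_pos`, `model_correct`, `confidence`, and `correct` are hypothetical names chosen for the example, not the evaluation code's actual schema.

```python
from statistics import mean


def expected_tossup_score(records: list[dict]) -> float:
    """+1 for a correct buzz, -0.5 for an incorrect buzz, 0 for no buzz.

    The model only scores (or is penalized) when it buzzes before the human
    opponent's recorded buzz position, mirroring the simulated competition.
    """
    scores = []
    for r in records:
        model_pos = r["model_buzz_pos"]                    # None = never buzzed
        human_pos = r.get("human_buzz_pos", float("inf"))  # inf = human never buzzed
        if model_pos is None or model_pos >= human_pos:
            scores.append(0.0)   # no buzz, or the human buzzed first
        elif r["model_correct"]:
            scores.append(1.0)
        else:
            scores.append(-0.5)
    return mean(scores)


def calibration(records: list[dict]) -> float:
    """Average absolute difference between confidence (0-1) and correctness (0/1)."""
    return mean(abs(r["confidence"] - float(r["correct"])) for r in records)


tossups = [
    {"model_buzz_pos": 20, "human_buzz_pos": 35, "model_correct": True},     # +1
    {"model_buzz_pos": 15, "human_buzz_pos": 30, "model_correct": False},    # -0.5
    {"model_buzz_pos": None, "human_buzz_pos": 25, "model_correct": False},  # 0
]
print(expected_tossup_score(tossups))  # (1.0 - 0.5 + 0.0) / 3 ≈ 0.167

bonus_parts = [{"confidence": 0.9, "correct": True}, {"confidence": 0.7, "correct": False}]
print(calibration(bonus_parts))  # (0.1 + 0.7) / 2 ≈ 0.4
```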
src/populate.py CHANGED
@@ -39,20 +39,15 @@ def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
             metrics = result["metrics"]
             username = result["username"]
             model_name = result["model_name"]
-            buzz_accuracy = metrics["buzz_accuracy"]
 
             row = {
                 "Submission": f"{username}/{model_name}",
-                "Avg Score ⬆️": metrics["tossup_score"],
-                "Buzz Accuracy": buzz_accuracy,
+                "Expected Score ⬆️": metrics["expected_score"],
+                "Buzz Precision": metrics["buzz_accuracy"],
+                "Buzz Frequency": metrics["buzz_frequency"],
                 "Buzz Position": metrics["buzz_position"],
+                "Win Rate w/ Humans": metrics.get("human_win_rate", None),
             }
-            if "human_win_rate" in metrics:
-                row["Win Rate w/ Humans"] = metrics["human_win_rate"]
-                # row["Win Rate w/ Humans (Aggressive)"] = metrics["human_win_rate_strict"]
-            else:
-                row["Win Rate w/ Humans"] = None
-                # row["Win Rate w/ Humans (Aggressive)"] = None
             eval_results.append(row)
         except Exception as e:
             logger.error(f"Error processing model result '{username}/{model_name}': {e}")
@@ -62,14 +57,14 @@ def get_tossups_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
         eval_results,
         columns=[
             "Submission",
-            "Avg Score ⬆️",
-            "Buzz Accuracy",
+            "Expected Score ⬆️",
+            "Buzz Precision",
+            "Buzz Frequency",
             "Buzz Position",
             "Win Rate w/ Humans",
-            # "Win Rate w/ Humans (Aggressive)",
         ],
     )
-    df.sort_values(by="Avg Score ⬆️", ascending=False, inplace=True)
+    df.sort_values(by="Expected Score ⬆️", ascending=False, inplace=True)
     return df
 
 
@@ -85,8 +80,11 @@ def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
 
             row = {
                 "Submission": f"{username}/{model_name}",
-                "Question Accuracy": metrics["question_accuracy"],
-                "Part Accuracy": metrics["part_accuracy"],
+                "Effect ⬆️": metrics["effectiveness"],
+                "Part Acc": metrics["part_accuracy"],
+                "Question Acc": metrics["question_accuracy"],
+                "Calibration": metrics["calibration"],
+                "Adoption": metrics["adoption"],
             }
             eval_results.append(row)
         except Exception as e:
@@ -95,31 +93,32 @@ def get_bonuses_leaderboard_df(repo_dir: str, eval_split: str) -> pd.DataFrame:
 
     df = pd.DataFrame(
         eval_results,
-        columns=["Submission", "Question Accuracy", "Part Accuracy"],
+        columns=["Submission", "Effect ⬆️", "Part Acc", "Question Acc", "Calibration", "Adoption"],
    )
-    df.sort_values(by="Question Accuracy", ascending=False, inplace=True)
+    df.sort_values(by="Effect ⬆️", ascending=False, inplace=True)
     return df
 
 
+def colour_pos_neg(v):
+    """Return a CSS rule for the cell that called the function."""
+    if pd.isna(v):  # keep NaNs unstyled
+        return ""
+    return "color: green;" if v > 0 else "color: red;"
+
+
 def fetch_tossup_leaderboard(split: str = "tiny_eval", style: bool = True):
     df = get_tossups_leaderboard_df(EVAL_RESULTS_PATH, split)
 
-    def colour_pos_neg(v):
-        """Return a CSS rule for the cell that called the function."""
-        if pd.isna(v):  # keep NaNs unstyled
-            return ""
-        return "color: green;" if v > 0 else "color: red;"
-
     # Apply formatting and styling
     styled_df = df.style.format(
         {
-            "Avg Score ⬆️": "{:5.2f}",
-            "Buzz Accuracy": "{:>6.1%}",
+            "Expected Score ⬆️": "{:5.2f}",
+            "Buzz Precision": "{:>6.1%}",
             "Buzz Position": "{:>6.1f}",
+            "Buzz Frequency": "{:>6.1f}",
             "Win Rate w/ Humans": "{:>6.1%}",
-            # "Win Rate w/ Humans (Aggressive)": "{:>6.1%}",
         }
-    ).map(colour_pos_neg, subset=["Avg Score ⬆️"])
+    ).map(colour_pos_neg, subset=["Expected Score ⬆️"])
 
     return styled_df if style else df
 
@@ -130,17 +129,82 @@ def fetch_bonus_leaderboard(split: str = "tiny_eval", style: bool = True):
     # Apply formatting and styling
     styled_df = df.style.format(
         {
-            "Question Accuracy": "{:>6.1%}",
-            "Part Accuracy": "{:>6.1%}",
+            "Question Acc": "{:>6.1%}",
+            "Part Acc": "{:>6.1%}",
+            "Effect ⬆️": "{:5.2f}",
+            "Calibration": "{:>6.1%}",
+            "Adoption": "{:>6.1%}",
         }
-    )
+    ).map(colour_pos_neg, subset=["Effect ⬆️"])
 
     return styled_df if style else df
 
 
 # TODO: Implement this once we have the proxy server running.
 def create_overall_leaderboard(tossup_df: pd.DataFrame, bonus_df: pd.DataFrame) -> pd.DataFrame:
-    # Merge the two dataframes on the 'Submission' column
-    merged_df = pd.merge(tossup_df, bonus_df, on="Submission", how="outer")
+    # Helper to extract username from 'Submission' (format: username/model_name)
+    def extract_username(submission: str) -> str:
+        return submission.split("/", 1)[0] if "/" in submission else submission
+
+    # Add username columns
+    tossup_df = tossup_df.copy()
+    tossup_df["Username"] = tossup_df["Submission"].apply(extract_username)
+    bonus_df = bonus_df.copy()
+    bonus_df["Username"] = bonus_df["Submission"].apply(extract_username)
+
+    # Pick best tossup per user (highest Expected Score ⬆️)
+    tossup_best = tossup_df.sort_values("Expected Score ⬆️", ascending=False).drop_duplicates("Username")
+    tossup_best = tossup_best.set_index("Username")
+
+    # Pick best bonus per user (highest Effect ⬆️)
+    bonus_best = bonus_df.sort_values("Effect ⬆️", ascending=False).drop_duplicates("Username")
+    bonus_best = bonus_best.set_index("Username")
+
+    # Merge on Username (outer join to include users who have only one type)
+    merged = pd.merge(
+        tossup_best,
+        bonus_best,
+        left_index=True,
+        right_index=True,
+        how="outer",
+        suffixes=("_tossup", "_bonus"),
+    )
 
-    # Calculate the overall score as a weighted average
+    # Compose a summary row per user
+    # Columns: Username, Tossup Submission, Bonus Submission, all metrics from both
+    leaderboard = pd.DataFrame(
+        {
+            "Username": merged.index,
+            "Tossup Submission": merged["Submission_tossup"].str.split("/").str[1],
+            "Bonus Submission": merged["Submission_bonus"].str.split("/").str[1],
+            "Overall Score ⬆️": merged[["Expected Score ⬆️", "Effect ⬆️"]].fillna(0).sum(axis=1),
+            "Expected Score (Tossup) ⬆️": merged["Expected Score ⬆️"],
+            "Effect (Bonus) ⬆️": merged["Effect ⬆️"],
+            "Part Acc (Bonus)": merged["Part Acc"],
+            "Adoption (Bonus)": merged["Adoption"],
+        }
+    )
+
+    leaderboard = leaderboard.sort_values("Overall Score ⬆️", ascending=False)
+
+    return leaderboard.reset_index(drop=True)
+
+
+def fetch_overall_leaderboard(split: str = "tiny_eval", style: bool = True):
+    bonus_df = fetch_bonus_leaderboard(split, style=False)
+    tossup_df = fetch_tossup_leaderboard(split, style=False)
+    overall_df = create_overall_leaderboard(tossup_df, bonus_df)
+
+    # Apply formatting and styling
+    styled_df = overall_df.style.format(
+        {
+            "Overall Score ⬆️": "{:5.2f}",
+            "Expected Score (Tossup) ⬆️": "{:5.2f}",
+            "Effect (Bonus) ⬆️": "{:5.2f}",
+            "Part Acc (Bonus)": "{:>6.1%}",
+            "Adoption (Bonus)": "{:>6.1%}",
+        },
+        na_rep="-",
+    ).map(colour_pos_neg, subset=["Overall Score ⬆️"])
+
+    return styled_df if style else overall_df
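
As a quick sanity check of the new merge logic, here is a hypothetical usage sketch (not part of the commit) that feeds toy tossup and bonus frames, with made-up usernames and scores, through `create_overall_leaderboard`. It assumes the repository's `src` package imports cleanly from the project root.

```python
import pandas as pd

from src.populate import create_overall_leaderboard

# Toy frames shaped like the unstyled tossup/bonus leaderboards built above.
tossup_df = pd.DataFrame(
    {
        "Submission": ["alice/model-a", "alice/model-b", "bob/model-c"],
        "Expected Score ⬆️": [0.42, 0.31, 0.18],
        "Buzz Precision": [0.80, 0.70, 0.60],
        "Buzz Frequency": [0.90, 0.80, 0.70],
        "Buzz Position": [55.0, 60.0, 70.0],
        "Win Rate w/ Humans": [0.50, 0.40, 0.30],
    }
)
bonus_df = pd.DataFrame(
    {
        "Submission": ["alice/model-a", "carol/model-d"],
        "Effect ⬆️": [0.12, 0.05],
        "Part Acc": [0.70, 0.60],
        "Question Acc": [0.50, 0.40],
        "Calibration": [0.20, 0.25],
        "Adoption": [0.60, 0.50],
    }
)

overall = create_overall_leaderboard(tossup_df, bonus_df)
print(overall[["Username", "Tossup Submission", "Bonus Submission", "Overall Score ⬆️"]])
# Each user keeps only their best tossup and best bonus submission; users with a single
# submission type still appear, and the missing score is treated as 0 in the overall sum.
```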