xeon27 committed · Commit 2718fde
Parent(s): 1249741

Add results for GAIA and GDM tasks

Changed files:
- create_log_file_map.py +13 -8
- inspect_log_file_names.json +20 -8
- refactor_eval_results.py +69 -29
- src/about.py +4 -2
- src/populate.py +3 -0
create_log_file_map.py
CHANGED
@@ -3,11 +3,13 @@ import os
 
 from collections import defaultdict
 
+from refactor_eval_results import AGENTIC_LOG_MODEL_NAME_MAP, AGENTIC_TASKS
+
 
 def main():
 
     base_bm_input_path = "./base_benchmarking_logs"
-    agentic_bm_input_path = "
+    agentic_bm_input_path = "/fs01/projects/aieng/public/inspect_evals/agentic_benchmarking_runs"
 
     log_file_map = defaultdict()
 
@@ -20,13 +22,16 @@ def main():
                     task_name = result["eval"]["task"].split("/")[-1]
                     log_file_map[model_name][task_name] = task_log_file
 
-    for model_name in
-
-
-
-
-
-
+    for model_name in AGENTIC_LOG_MODEL_NAME_MAP.keys():
+        log_file_path = os.path.join(agentic_bm_input_path, AGENTIC_LOG_MODEL_NAME_MAP[model_name])
+        if os.path.isdir(log_file_path):
+            for task in AGENTIC_TASKS:
+                for task_log_file in os.listdir(os.path.join(log_file_path, task)):
+                    if task_log_file.endswith(".json"):
+                        with open(os.path.join(log_file_path, task, task_log_file), "r") as f:
+                            result = json.load(f)
+                            task_name = result["eval"]["task"].split("/")[-1]
+                            log_file_map[model_name][task_name] = task_log_file
 
     with open("./inspect_log_file_names.json", "w") as f:
         json.dump(log_file_map, f, indent=4)
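For context, a minimal sketch of reading the generated map back (the lookup below uses the gemini-1.5-pro GAIA entry added in this commit; the script itself only writes the file):

    import json

    with open("./inspect_log_file_names.json", "r") as f:
        log_file_map = json.load(f)

    # Resolve the log file recorded for one model/task pair.
    print(log_file_map["gemini-1.5-pro"]["gaia"])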
inspect_log_file_names.json
CHANGED
@@ -2,9 +2,11 @@
     "gemini-1.5-pro": {
         "mmlu": "2024-11-04T16-56-26-05-00_mmlu_Z9KrcK7x4ZLAR5nJ9JaVUe.json",
         "humaneval": "2024-11-04T12-43-07-05-00_humaneval_5JBjtymGtK23qwVKxqidhV.json",
+        "mmmu_multiple_choice": "2025-01-20T23-16-04-05-00_mmmu-multiple-choice_NLmxmHYt6CJymRVVa5UsbD.json",
         "mmlu_pro": "2024-11-04T20-13-09-05-00_mmlu-pro_Hv2ujvKLV6H7ZwQu2q8LNw.json",
         "math": "2024-11-04T15-48-46-05-00_math_9DAZmGEfhpa3nUcmMAwqZe.json",
         "arc_easy": "2024-11-04T12-31-43-05-00_arc-easy_eGxYWywpLuREcaCKvHa8Uk.json",
+        "mmmu_open": "2025-01-20T23-19-25-05-00_mmmu-open_CDbtEQ7tjs5zkj4ScBbzod.json",
         "gsm8k": "2024-11-04T15-15-26-05-00_gsm8k_cTebw3ugfrVz3dyPwxtdUZ.json",
         "gpqa_diamond": "2024-11-05T09-56-31-05-00_gpqa-diamond_FBq2bnoyGYQ3NF96xQw8iy.json",
         "ifeval": "2024-11-04T12-43-32-05-00_ifeval_mSwZ7AwA7akj5PjZbQMjgC.json",
@@ -12,13 +14,15 @@
         "arc_challenge": "2024-11-04T12-37-36-05-00_arc-challenge_5VVApyQD22QpJoMm53EMdU.json",
         "drop": "2024-11-04T12-44-32-05-00_drop_9dzPKVJojSVsxmiBFnej2m.json",
         "hellaswag": "2024-11-05T13-14-31-05-00_hellaswag_N98eeftuY2pucRtgpUYk5m.json",
-        "gaia": "
-        "gdm_intercode_ctf": "
+        "gaia": "2025-01-21T15-33-29-05-00_gemini-1.5-pro_gaia_merged.json",
+        "gdm_intercode_ctf": "2025-01-21T23-59-58+00-00_gemini-1.5-pro_gdm-intercode-ctf_merged.json",
+        "gdm_in_house_ctf": "2025-01-22T03-42-16+00-00_gemini-1.5-pro_gdm-in-house-ctf.json"
     },
     "gemini-1.5-flash": {
         "gpqa_diamond": "2024-11-04T12-47-34-05-00_gpqa-diamond_cL5kQj8DWbRfxz79piTSdy.json",
         "arc_challenge": "2024-11-04T12-45-59-05-00_arc-challenge_YQLMHfEXqeYgGJY86EB9bp.json",
         "math": "2024-11-04T15-25-38-05-00_math_eaYBRMFgo8p6VUUCYxnCWj.json",
+        "mmmu_open": "2025-01-20T23-23-50-05-00_mmmu-open_L7CnETP7d49axc7L8ChEZ4.json",
         "drop": "2024-11-04T12-52-08-05-00_drop_5i253AQzbENgHTYN4ATemV.json",
         "mmlu_pro": "2024-11-04T19-44-13-05-00_mmlu-pro_8GrR6wUsYNkthiZNMmLa8y.json",
         "ifeval": "2024-11-04T12-51-30-05-00_ifeval_ZATErMbLHoyxh4kDaSqy8j.json",
@@ -28,20 +32,22 @@
         "arc_easy": "2024-11-04T12-39-50-05-00_arc-easy_NwmTEw6C8VSCXzzwZCFy48.json",
         "gsm8k": "2024-11-04T15-22-21-05-00_gsm8k_hdJs3Z6XzpR5netTcWLXJT.json",
         "mmlu": "2024-11-04T16-26-13-05-00_mmlu_QvfQ46qJen2bvxiktHu86H.json",
-        "
+        "mmmu_multiple_choice": "2025-01-20T23-21-33-05-00_mmmu-multiple-choice_3huWbH3SVWx7NTGwYoKbBD.json"
     },
     "o1": {
         "winogrande": "2025-01-20T16-46-06-05-00_winogrande_YUtAdEsForRffqe4Sm3wtR.json",
         "humaneval": "2025-01-17T14-59-12-05-00_humaneval_RRL8GMy9NakTxUHsDVWNng.json",
         "mmmu_open": "2025-01-20T22-48-09-05-00_mmmu-open_oBzxJBYbvnktbbAwhoCrYK.json",
-        "mmlu_pro": "2025-01-20T14-02-37-05-00_mmlu-pro_EvDzvqaahQwhv6fJovN4BT.json",
         "math": "2025-01-17T15-03-22-05-00_math_6BbvHFF8hLMsVYozyNLbyQ.json",
         "arc_easy": "2025-01-17T11-29-26-05-00_arc-easy_DFbir4BdgQDbKd52r7tRKR.json",
         "arc_challenge": "2025-01-17T11-44-42-05-00_arc-challenge_PsWXaBqrgv3EcTZC55gRzJ.json",
         "gsm8k": "2025-01-17T12-56-38-05-00_gsm8k_iD8275qeyNTgX523pn45bF.json",
         "gpqa_diamond": "2025-01-17T11-53-53-05-00_gpqa-diamond_EJV7ULFSQLRoFTEqsv3t6q.json",
-        "
-        "mmmu_multiple_choice": "2025-01-20T21-04-57-05-00_mmmu-multiple-choice_MctxjookaeTLCL8KpUeazT.json"
+        "mmlu_pro": "2025-01-20T14-02-37-05-00_mmlu-pro_EvDzvqaahQwhv6fJovN4BT.json",
+        "mmmu_multiple_choice": "2025-01-20T21-04-57-05-00_mmmu-multiple-choice_MctxjookaeTLCL8KpUeazT.json",
+        "gaia": "2025-01-22T13-42-00-05-00_o1_gaia_merged.json",
+        "gdm_intercode_ctf": "2025-01-22T20-46-35+00-00_o1_gdm-intercode-ctf_merged.json",
+        "gdm_in_house_ctf": "2025-01-22T05-52-25+00-00_o1_gdm-in-house-ctf.json"
     },
     "claude-3-5-sonnet-20241022": {
         "mmmu_multiple_choice": "2025-01-21T11-20-03-05-00_mmmu-multiple-choice_CWhKvGdoFo6pdHhDyi9GNm.json",
@@ -57,7 +63,10 @@
         "ifeval": "2025-01-16T11-28-44-05-00_ifeval_fmWxch4ZjbmYCST6yUZsdV.json",
         "humaneval": "2025-01-16T11-26-12-05-00_humaneval_kUASiaNd9uZfWvCwYHhdF5.json",
         "winogrande": "2025-01-16T22-09-41-05-00_winogrande_mSWGAKg75E5RP79KWizvb9.json",
-        "drop": "2025-01-15T10-15-15-05-00_drop_Z9A2Y84HYponNxnzNT9TNq.json"
+        "drop": "2025-01-15T10-15-15-05-00_drop_Z9A2Y84HYponNxnzNT9TNq.json",
+        "gaia": "2025-01-12T23-57-37-05-00_claude-3-5-sonnet_gaia_merged.json",
+        "gdm_intercode_ctf": "2025-01-11T02-47-45-05-00_claude-3-5-sonnet_gdm-intercode-ctf_merged.json",
+        "gdm_in_house_ctf": "2025-01-11T07-41-14+00-00_claude-3-5-sonnet_gdm-in-house-ctf.json"
     },
     "c4ai-command-r-plus": {
         "ifeval": "2024-10-30T17-23-04-04-00_ifeval_RGucUMwdGmUnRpqyMTZTzW.json",
@@ -117,7 +126,10 @@
         "mmmu_multiple_choice": "2025-01-20T23-03-21-05-00_mmmu-multiple-choice_eoycAFLMirSqiURdXmBP2e.json",
         "humaneval": "2024-10-31T04-59-42-04-00_humaneval_nmJcd84CcNKjWS8fBfMbZM.json",
         "math": "2024-10-31T05-01-22-04-00_math_cDSpKPp3nLrFy8uYfYKEbM.json",
-        "hellaswag": "2024-10-31T03-33-47-04-00_hellaswag_JNnnPuz3dhZRpyXzizMUBF.json"
+        "hellaswag": "2024-10-31T03-33-47-04-00_hellaswag_JNnnPuz3dhZRpyXzizMUBF.json",
+        "gaia": "2025-01-13T15-53-22+00-00_gpt-4o_gaia_merged.json",
+        "gdm_intercode_ctf": "2025-01-08T10-06-29-05-00_gpt-4o_gdm-intercode-ctf_merged.json",
+        "gdm_in_house_ctf": "2025-01-11T07-02-14+00-00_gpt-4o_gdm-in-house-ctf.json"
     },
     "Mistral-Large-Instruct-2407": {
         "drop": "2024-10-31T01-56-12-04-00_drop_NtvuCoU2LoMbH8DztcCTen.json",
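A hypothetical sanity check, not part of this commit, that flags models still missing a log file for any task tracked in METRIC_NAME from refactor_eval_results.py:

    import json

    from refactor_eval_results import METRIC_NAME

    with open("./inspect_log_file_names.json", "r") as f:
        log_file_map = json.load(f)

    for model, tasks in log_file_map.items():
        missing = sorted(set(METRIC_NAME) - set(tasks))
        if missing:
            print(f"{model}: no log file for {missing}")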
refactor_eval_results.py
CHANGED
@@ -20,8 +20,9 @@ METRIC_NAME = {
     "mmmu_open": "accuracy",
 
     # agentic
-    "gaia": "
+    "gaia": "accuracy",
     "gdm_intercode_ctf": "accuracy",
+    "gdm_in_house_ctf": "accuracy",
 }
 
 MODEL_SHA_MAP = {
@@ -40,8 +41,17 @@ MODEL_SHA_MAP = {
     "o1": "https://openai.com/o1",
 }
 
+AGENTIC_LOG_MODEL_NAME_MAP = {
+    "claude-3-5-sonnet-20241022": "claude-3-5-sonnet-20241022",
+    "gemini-1.5-pro": "gemini-1.5-pro-002",
+    "gpt-4o": "gpt-4o-2024-08-06",
+    "o1": "o1-2024-12-17",
+}
+
+AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf"]
 
-def combine_eval_results(results_path: str, model_name: str) -> dict:
+
+def combine_eval_results(results_path: str, model_name: str, type: str) -> dict:
     results = dict(
         {
             "config": {
@@ -53,29 +63,57 @@ def combine_eval_results(results_path: str, model_name: str) -> dict:
             "results": {},
        }
     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    if type == "base":
+        for file in os.listdir(os.path.join(results_path, model_name)):
+            if file.endswith(".json"):
+                with open(os.path.join(results_path, model_name, file), "r") as f:
+                    try:
+                        result = json.load(f)
+                        task_name = result["eval"]["task"].split("/")[-1]
+                        if task_name == "math":
+                            metrics = [elm for elm in result["results"]["scores"] if elm["name"] == "expression_equivalance"][0]["metrics"]  # TODO: change scorer if required
+                        else:
+                            metrics = result["results"]["scores"][0]["metrics"]
+                        metric_name = metrics[METRIC_NAME[task_name]]["name"]
+                        metric_value = metrics[METRIC_NAME[task_name]]["value"]
+                        results["results"].update(
+                            {
+                                task_name: {
+                                    metric_name: metric_value
+                                }
                             }
-
-
-
-
-
-
+                        )
+                    except KeyError as e:
+                        print(f"KeyError: {e}")
+                        print(model_name)
+                        print(file)
+
+    elif type == "agentic":
+        model_name = AGENTIC_LOG_MODEL_NAME_MAP[model_name]  # change name based on log file structure
+        results_path = os.path.join(results_path, model_name)
+        for task in AGENTIC_TASKS:
+            for file in os.listdir(os.path.join(results_path, task)):
+                if file.endswith(".json"):
+                    with open(os.path.join(results_path, task, file), "r") as f:
+                        try:
+                            result = json.load(f)
+                            task_name = result["eval"]["task"].split("/")[-1]
+                            metrics = result["results"]["scores"][0]["metrics"]
+                            metric_name = metrics[METRIC_NAME[task_name]]["name"]
+                            metric_value = metrics[METRIC_NAME[task_name]]["value"]
+                            results["results"].update(
+                                {
+                                    task_name: {
+                                        metric_name: metric_value
+                                    }
+                                }
+                            )
+                        except KeyError as e:
+                            print(f"KeyError: {e}")
+                            print(model_name)
+                            print(file)
+
     return results
 
 
@@ -86,19 +124,21 @@ def main():
     EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 
     base_bm_input_path = "./base_benchmarking_logs"
-    agentic_bm_input_path = "
+    agentic_bm_input_path = "/fs01/projects/aieng/public/inspect_evals/agentic_benchmarking_runs"
     os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
     os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)
 
     for model_name in os.listdir(base_bm_input_path):
+
         if os.path.isdir(os.path.join(base_bm_input_path, model_name)):
-            results = combine_eval_results(base_bm_input_path, model_name)
-            # TMP: Add
+            results = combine_eval_results(base_bm_input_path, model_name, "base")
+            # TMP: Add missing benchmarks to the results
             for metric in METRIC_NAME.items():
                 if metric[0] not in results["results"]:
                     results["results"].update({metric[0]: {metric[1]: None}})
-
-
+
+            if os.path.isdir(os.path.join(agentic_bm_input_path, AGENTIC_LOG_MODEL_NAME_MAP.get(model_name, "NA"))):
+                agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name, "agentic")
                 results["results"].update(agentic_bm_results["results"])
             with open(os.path.join(EVAL_RESULTS_PATH, f"{model_name}.json"), "w") as f:
                 json.dump(results, f, indent=4)
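Both new branches share one extraction pattern; the sketch below condenses it, assuming the Inspect JSON log layout the code above relies on (the helper name extract_metric is illustrative, not part of the commit):

    import json

    def extract_metric(log_path: str, metric_key: str) -> tuple:
        """Return (task_name, metric_name, metric_value) from one Inspect log."""
        with open(log_path, "r") as f:
            result = json.load(f)
        task_name = result["eval"]["task"].split("/")[-1]
        metrics = result["results"]["scores"][0]["metrics"]
        return task_name, metrics[metric_key]["name"], metrics[metric_key]["value"]

The base branch departs from this only for math, where it selects the "expression_equivalance" scorer rather than scores[0].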
src/about.py
CHANGED
@@ -32,8 +32,9 @@ class Tasks(Enum):
     task13 = Task("mmmu_open", "accuracy", "MMMU-Open-Ended", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
 
     # agentic
-    task14 = Task("gaia", "
+    task14 = Task("gaia", "accuracy", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
     task15 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf")
+    task16 = Task("gdm_in_house_ctf", "accuracy", "GDM-In-House-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf")
 
 
 NUM_FEWSHOT = 0  # Change with your few shot
@@ -59,7 +60,7 @@ LLM_BENCHMARKS_TEXT = f"""
 The **Vector State of Evaluation Leaderboard** presents the performance of selected LLM models on a variety of tasks. These tasks are divided into two categories:
 
 - **Base Tasks**: ARC-Easy, ARC-Challenge, DROP, WinoGrande, GSM8K, HellaSwag, HumanEval, IFEval, MATH, MMLU, MMLU-Pro, GPQA-Diamond, MMMU-Multiple-Choice, MMMU-Open-Ended
-- **Agentic Tasks**: GAIA, GDM-InterCode-CTF
+- **Agentic Tasks**: GAIA, GDM-InterCode-CTF, GDM-In-House-CTF
 
 Users can compare models side by side to see how they perform on both base-level understanding tasks and more advanced, “agentic” tasks.
 
@@ -93,6 +94,7 @@ Here is a closer look at each benchmark included in the leaderboard:
 ### Agentic Benchmarks
 - **GAIA**: Evaluates more autonomous or “agentic” reasoning, including planning and problem-solving.
 - **GDM-InterCode-CTF**: A capture-the-flag style challenge focusing on code interpretation and generative debugging strategies.
+- **GDM-In-House-CTF**: A capture-the-flag style challenge testing a variety of security skills pertaining to basic web application security.
 
 ---
 """
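For reference, these Enum entries assume a five-field Task container. A sketch of its likely shape (the real definition lives elsewhere in src/about.py; the field names below are guesses from the call sites above):

    from dataclasses import dataclass

    @dataclass
    class Task:
        benchmark: str   # results key, e.g. "gdm_in_house_ctf"
        metric: str      # metric name, e.g. "accuracy"
        col_name: str    # leaderboard column label, e.g. "GDM-In-House-CTF"
        category: str    # "base" or "agentic"
        source_url: str  # link to the inspect_evals implementation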
src/populate.py
CHANGED
@@ -46,6 +46,9 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     # subset for model and benchmark cols
     df = df[[AutoEvalColumn.model.name] + benchmark_cols]
 
+    # drop rows for which all benchmark cols are empty
+    df = df.dropna(subset=benchmark_cols, axis=0, how="all")
+
     df = df.fillna(EMPTY_SYMBOL)
 
     # make values clickable and link to log files
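A small illustration of the new dropna call (toy data; the fillna(EMPTY_SYMBOL) that follows then only touches partially filled rows):

    import pandas as pd

    df = pd.DataFrame({
        "model": ["a", "b"],
        "gaia": [0.3, None],
        "gdm_intercode_ctf": [0.5, None],
    })

    # Row "b" has no benchmark values at all, so it is dropped entirely;
    # row "a" is kept even though some benchmark columns could be NaN.
    df = df.dropna(subset=["gaia", "gdm_intercode_ctf"], axis=0, how="all")
    print(df)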