Commit 2718fde by xeon27 · Parent: 1249741

Add results for GAIA and GDM tasks
create_log_file_map.py CHANGED
@@ -3,11 +3,13 @@ import os
 
 from collections import defaultdict
 
+from refactor_eval_results import AGENTIC_LOG_MODEL_NAME_MAP, AGENTIC_TASKS
+
 
 def main():
 
     base_bm_input_path = "./base_benchmarking_logs"
-    agentic_bm_input_path = "./agentic_benchmarking_logs"
+    agentic_bm_input_path = "/fs01/projects/aieng/public/inspect_evals/agentic_benchmarking_runs"
 
     log_file_map = defaultdict()
 
@@ -20,13 +22,16 @@ def main():
             task_name = result["eval"]["task"].split("/")[-1]
             log_file_map[model_name][task_name] = task_log_file
 
-    for model_name in os.listdir(agentic_bm_input_path):
-        if os.path.isdir(os.path.join(agentic_bm_input_path, model_name)):
-            for task_log_file in os.listdir(os.path.join(agentic_bm_input_path, model_name)):
-                with open(os.path.join(agentic_bm_input_path, model_name, task_log_file), "r") as f:
-                    result = json.load(f)
-                    task_name = result["eval"]["task"].split("/")[-1]
-                    log_file_map[model_name][task_name] = task_log_file
+    for model_name in AGENTIC_LOG_MODEL_NAME_MAP.keys():
+        log_file_path = os.path.join(agentic_bm_input_path, AGENTIC_LOG_MODEL_NAME_MAP[model_name])
+        if os.path.isdir(log_file_path):
+            for task in AGENTIC_TASKS:
+                for task_log_file in os.listdir(os.path.join(log_file_path, task)):
+                    if task_log_file.endswith(".json"):
+                        with open(os.path.join(log_file_path, task, task_log_file), "r") as f:
+                            result = json.load(f)
+                            task_name = result["eval"]["task"].split("/")[-1]
+                            log_file_map[model_name][task_name] = task_log_file
 
     with open("./inspect_log_file_names.json", "w") as f:
         json.dump(log_file_map, f, indent=4)
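
The script writes a two-level map (model name -> task name -> raw log file name) to inspect_log_file_names.json, whose updated contents appear in the next file. A minimal sketch of reading the map back, assuming only the structure shown there:

    import json

    # Load the map written by create_log_file_map.py above
    with open("./inspect_log_file_names.json", "r") as f:
        log_file_map = json.load(f)

    # Two-level lookup: model name -> task name -> log file name
    print(log_file_map["gemini-1.5-pro"]["gaia"])
    # -> 2025-01-21T15-33-29-05-00_gemini-1.5-pro_gaia_merged.json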
inspect_log_file_names.json CHANGED
@@ -2,9 +2,11 @@
     "gemini-1.5-pro": {
         "mmlu": "2024-11-04T16-56-26-05-00_mmlu_Z9KrcK7x4ZLAR5nJ9JaVUe.json",
         "humaneval": "2024-11-04T12-43-07-05-00_humaneval_5JBjtymGtK23qwVKxqidhV.json",
+        "mmmu_multiple_choice": "2025-01-20T23-16-04-05-00_mmmu-multiple-choice_NLmxmHYt6CJymRVVa5UsbD.json",
         "mmlu_pro": "2024-11-04T20-13-09-05-00_mmlu-pro_Hv2ujvKLV6H7ZwQu2q8LNw.json",
         "math": "2024-11-04T15-48-46-05-00_math_9DAZmGEfhpa3nUcmMAwqZe.json",
         "arc_easy": "2024-11-04T12-31-43-05-00_arc-easy_eGxYWywpLuREcaCKvHa8Uk.json",
+        "mmmu_open": "2025-01-20T23-19-25-05-00_mmmu-open_CDbtEQ7tjs5zkj4ScBbzod.json",
         "gsm8k": "2024-11-04T15-15-26-05-00_gsm8k_cTebw3ugfrVz3dyPwxtdUZ.json",
         "gpqa_diamond": "2024-11-05T09-56-31-05-00_gpqa-diamond_FBq2bnoyGYQ3NF96xQw8iy.json",
         "ifeval": "2024-11-04T12-43-32-05-00_ifeval_mSwZ7AwA7akj5PjZbQMjgC.json",
@@ -12,13 +14,15 @@
         "arc_challenge": "2024-11-04T12-37-36-05-00_arc-challenge_5VVApyQD22QpJoMm53EMdU.json",
         "drop": "2024-11-04T12-44-32-05-00_drop_9dzPKVJojSVsxmiBFnej2m.json",
         "hellaswag": "2024-11-05T13-14-31-05-00_hellaswag_N98eeftuY2pucRtgpUYk5m.json",
-        "gaia": "2024-11-15T12-53-32-05-00_gaia_NvyGRTXFrFskJfUvuLwvVr.json",
-        "gdm_intercode_ctf": "2024-11-15T16-23-23-05-00_gdm-intercode-ctf_3JrgtTMcijTUxHVaagPRYh.json"
+        "gaia": "2025-01-21T15-33-29-05-00_gemini-1.5-pro_gaia_merged.json",
+        "gdm_intercode_ctf": "2025-01-21T23-59-58+00-00_gemini-1.5-pro_gdm-intercode-ctf_merged.json",
+        "gdm_in_house_ctf": "2025-01-22T03-42-16+00-00_gemini-1.5-pro_gdm-in-house-ctf.json"
     },
     "gemini-1.5-flash": {
         "gpqa_diamond": "2024-11-04T12-47-34-05-00_gpqa-diamond_cL5kQj8DWbRfxz79piTSdy.json",
         "arc_challenge": "2024-11-04T12-45-59-05-00_arc-challenge_YQLMHfEXqeYgGJY86EB9bp.json",
         "math": "2024-11-04T15-25-38-05-00_math_eaYBRMFgo8p6VUUCYxnCWj.json",
+        "mmmu_open": "2025-01-20T23-23-50-05-00_mmmu-open_L7CnETP7d49axc7L8ChEZ4.json",
         "drop": "2024-11-04T12-52-08-05-00_drop_5i253AQzbENgHTYN4ATemV.json",
         "mmlu_pro": "2024-11-04T19-44-13-05-00_mmlu-pro_8GrR6wUsYNkthiZNMmLa8y.json",
         "ifeval": "2024-11-04T12-51-30-05-00_ifeval_ZATErMbLHoyxh4kDaSqy8j.json",
@@ -28,20 +32,22 @@
         "arc_easy": "2024-11-04T12-39-50-05-00_arc-easy_NwmTEw6C8VSCXzzwZCFy48.json",
         "gsm8k": "2024-11-04T15-22-21-05-00_gsm8k_hdJs3Z6XzpR5netTcWLXJT.json",
         "mmlu": "2024-11-04T16-26-13-05-00_mmlu_QvfQ46qJen2bvxiktHu86H.json",
-        "gdm_intercode_ctf": "2024-11-15T20-52-53-05-00_gdm-intercode-ctf_oLYr3H6bFtrcmgM6EABmNt.json"
+        "mmmu_multiple_choice": "2025-01-20T23-21-33-05-00_mmmu-multiple-choice_3huWbH3SVWx7NTGwYoKbBD.json"
     },
     "o1": {
         "winogrande": "2025-01-20T16-46-06-05-00_winogrande_YUtAdEsForRffqe4Sm3wtR.json",
         "humaneval": "2025-01-17T14-59-12-05-00_humaneval_RRL8GMy9NakTxUHsDVWNng.json",
         "mmmu_open": "2025-01-20T22-48-09-05-00_mmmu-open_oBzxJBYbvnktbbAwhoCrYK.json",
-        "mmlu_pro": "2025-01-20T14-02-37-05-00_mmlu-pro_EvDzvqaahQwhv6fJovN4BT.json",
         "math": "2025-01-17T15-03-22-05-00_math_6BbvHFF8hLMsVYozyNLbyQ.json",
         "arc_easy": "2025-01-17T11-29-26-05-00_arc-easy_DFbir4BdgQDbKd52r7tRKR.json",
         "arc_challenge": "2025-01-17T11-44-42-05-00_arc-challenge_PsWXaBqrgv3EcTZC55gRzJ.json",
         "gsm8k": "2025-01-17T12-56-38-05-00_gsm8k_iD8275qeyNTgX523pn45bF.json",
         "gpqa_diamond": "2025-01-17T11-53-53-05-00_gpqa-diamond_EJV7ULFSQLRoFTEqsv3t6q.json",
-        "hellaswag": "2025-01-17T13-14-39-05-00_hellaswag_73sQJFnwpzWjTvEqKjUk4M.json",
-        "mmmu_multiple_choice": "2025-01-20T21-04-57-05-00_mmmu-multiple-choice_MctxjookaeTLCL8KpUeazT.json"
+        "mmlu_pro": "2025-01-20T14-02-37-05-00_mmlu-pro_EvDzvqaahQwhv6fJovN4BT.json",
+        "mmmu_multiple_choice": "2025-01-20T21-04-57-05-00_mmmu-multiple-choice_MctxjookaeTLCL8KpUeazT.json",
+        "gaia": "2025-01-22T13-42-00-05-00_o1_gaia_merged.json",
+        "gdm_intercode_ctf": "2025-01-22T20-46-35+00-00_o1_gdm-intercode-ctf_merged.json",
+        "gdm_in_house_ctf": "2025-01-22T05-52-25+00-00_o1_gdm-in-house-ctf.json"
     },
     "claude-3-5-sonnet-20241022": {
         "mmmu_multiple_choice": "2025-01-21T11-20-03-05-00_mmmu-multiple-choice_CWhKvGdoFo6pdHhDyi9GNm.json",
@@ -57,7 +63,10 @@
         "ifeval": "2025-01-16T11-28-44-05-00_ifeval_fmWxch4ZjbmYCST6yUZsdV.json",
         "humaneval": "2025-01-16T11-26-12-05-00_humaneval_kUASiaNd9uZfWvCwYHhdF5.json",
         "winogrande": "2025-01-16T22-09-41-05-00_winogrande_mSWGAKg75E5RP79KWizvb9.json",
-        "drop": "2025-01-15T10-15-15-05-00_drop_Z9A2Y84HYponNxnzNT9TNq.json"
+        "drop": "2025-01-15T10-15-15-05-00_drop_Z9A2Y84HYponNxnzNT9TNq.json",
+        "gaia": "2025-01-12T23-57-37-05-00_claude-3-5-sonnet_gaia_merged.json",
+        "gdm_intercode_ctf": "2025-01-11T02-47-45-05-00_claude-3-5-sonnet_gdm-intercode-ctf_merged.json",
+        "gdm_in_house_ctf": "2025-01-11T07-41-14+00-00_claude-3-5-sonnet_gdm-in-house-ctf.json"
     },
     "c4ai-command-r-plus": {
         "ifeval": "2024-10-30T17-23-04-04-00_ifeval_RGucUMwdGmUnRpqyMTZTzW.json",
@@ -117,7 +126,10 @@
         "mmmu_multiple_choice": "2025-01-20T23-03-21-05-00_mmmu-multiple-choice_eoycAFLMirSqiURdXmBP2e.json",
         "humaneval": "2024-10-31T04-59-42-04-00_humaneval_nmJcd84CcNKjWS8fBfMbZM.json",
         "math": "2024-10-31T05-01-22-04-00_math_cDSpKPp3nLrFy8uYfYKEbM.json",
-        "hellaswag": "2024-10-31T03-33-47-04-00_hellaswag_JNnnPuz3dhZRpyXzizMUBF.json"
+        "hellaswag": "2024-10-31T03-33-47-04-00_hellaswag_JNnnPuz3dhZRpyXzizMUBF.json",
+        "gaia": "2025-01-13T15-53-22+00-00_gpt-4o_gaia_merged.json",
+        "gdm_intercode_ctf": "2025-01-08T10-06-29-05-00_gpt-4o_gdm-intercode-ctf_merged.json",
+        "gdm_in_house_ctf": "2025-01-11T07-02-14+00-00_gpt-4o_gdm-in-house-ctf.json"
     },
     "Mistral-Large-Instruct-2407": {
         "drop": "2024-10-31T01-56-12-04-00_drop_NtvuCoU2LoMbH8DztcCTen.json",
refactor_eval_results.py CHANGED
@@ -20,8 +20,9 @@ METRIC_NAME = {
     "mmmu_open": "accuracy",
 
     # agentic
-    "gaia": "mean",
+    "gaia": "accuracy",
     "gdm_intercode_ctf": "accuracy",
+    "gdm_in_house_ctf": "accuracy",
 }
 
 MODEL_SHA_MAP = {
@@ -40,8 +41,17 @@ MODEL_SHA_MAP = {
     "o1": "https://openai.com/o1",
 }
 
+AGENTIC_LOG_MODEL_NAME_MAP = {
+    "claude-3-5-sonnet-20241022": "claude-3-5-sonnet-20241022",
+    "gemini-1.5-pro": "gemini-1.5-pro-002",
+    "gpt-4o": "gpt-4o-2024-08-06",
+    "o1": "o1-2024-12-17",
+}
+
+AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf"]
 
-def combine_eval_results(results_path: str, model_name: str) -> dict:
+
+def combine_eval_results(results_path: str, model_name: str, type: str) -> dict:
     results = dict(
         {
             "config": {
@@ -53,29 +63,57 @@ def combine_eval_results(results_path: str, model_name: str) -> dict:
             "results": {},
         }
     )
-    for file in os.listdir(os.path.join(results_path, model_name)):
-        if file.endswith(".json"):
-            with open(os.path.join(results_path, model_name, file), "r") as f:
-                try:
-                    result = json.load(f)
-                    task_name = result["eval"]["task"].split("/")[-1]
-                    if task_name == "math":
-                        metrics = [elm for elm in result["results"]["scores"] if elm["name"] == "expression_equivalance"][0]["metrics"]  # TODO: change scorer if required
-                    else:
-                        metrics = result["results"]["scores"][0]["metrics"]
-                    metric_name = metrics[METRIC_NAME[task_name]]["name"]
-                    metric_value = metrics[METRIC_NAME[task_name]]["value"]
-                    results["results"].update(
-                        {
-                            task_name: {
-                                metric_name: metric_value
-                            }
-                        }
-                    )
-                except KeyError as e:
-                    print(f"KeyError: {e}")
-                    print(model_name)
-                    print(file)
+
+    if type == "base":
+        for file in os.listdir(os.path.join(results_path, model_name)):
+            if file.endswith(".json"):
+                with open(os.path.join(results_path, model_name, file), "r") as f:
+                    try:
+                        result = json.load(f)
+                        task_name = result["eval"]["task"].split("/")[-1]
+                        if task_name == "math":
+                            metrics = [elm for elm in result["results"]["scores"] if elm["name"] == "expression_equivalance"][0]["metrics"]  # TODO: change scorer if required
+                        else:
+                            metrics = result["results"]["scores"][0]["metrics"]
+                        metric_name = metrics[METRIC_NAME[task_name]]["name"]
+                        metric_value = metrics[METRIC_NAME[task_name]]["value"]
+                        results["results"].update(
+                            {
+                                task_name: {
+                                    metric_name: metric_value
+                                }
+                            }
+                        )
+                    except KeyError as e:
+                        print(f"KeyError: {e}")
+                        print(model_name)
+                        print(file)
+
+    elif type == "agentic":
+        model_name = AGENTIC_LOG_MODEL_NAME_MAP[model_name]  # change name based on log file structure
+        results_path = os.path.join(results_path, model_name)
+        for task in AGENTIC_TASKS:
+            for file in os.listdir(os.path.join(results_path, task)):
+                if file.endswith(".json"):
+                    with open(os.path.join(results_path, task, file), "r") as f:
+                        try:
+                            result = json.load(f)
+                            task_name = result["eval"]["task"].split("/")[-1]
+                            metrics = result["results"]["scores"][0]["metrics"]
+                            metric_name = metrics[METRIC_NAME[task_name]]["name"]
+                            metric_value = metrics[METRIC_NAME[task_name]]["value"]
+                            results["results"].update(
+                                {
+                                    task_name: {
+                                        metric_name: metric_value
+                                    }
+                                }
+                            )
+                        except KeyError as e:
+                            print(f"KeyError: {e}")
+                            print(model_name)
+                            print(file)
+
     return results
 
 
@@ -86,19 +124,21 @@ def main():
     EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 
     base_bm_input_path = "./base_benchmarking_logs"
-    agentic_bm_input_path = "./agentic_benchmarking_logs"
+    agentic_bm_input_path = "/fs01/projects/aieng/public/inspect_evals/agentic_benchmarking_runs"
     os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
     os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)
 
     for model_name in os.listdir(base_bm_input_path):
+
         if os.path.isdir(os.path.join(base_bm_input_path, model_name)):
-            results = combine_eval_results(base_bm_input_path, model_name)
-            # TMP: Add dummy agentic benchmarks to the results
+            results = combine_eval_results(base_bm_input_path, model_name, "base")
+            # TMP: Add missing benchmarks to the results
            for metric in METRIC_NAME.items():
                 if metric[0] not in results["results"]:
                     results["results"].update({metric[0]: {metric[1]: None}})
-            if os.path.isdir(os.path.join(agentic_bm_input_path, model_name)):
-                agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name)
+
+            if os.path.isdir(os.path.join(agentic_bm_input_path, AGENTIC_LOG_MODEL_NAME_MAP.get(model_name, "NA"))):
+                agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name, "agentic")
                 results["results"].update(agentic_bm_results["results"])
             with open(os.path.join(EVAL_RESULTS_PATH, f"{model_name}.json"), "w") as f:
                 json.dump(results, f, indent=4)
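
Both branches of combine_eval_results assume the same Inspect log layout: the task identifier under eval.task and a list of scorers under results.scores, each carrying a metrics dict keyed by metric name. A minimal sketch of a log the agentic branch can parse (the scorer name and metric value are illustrative, not taken from the actual runs):

    # Hypothetical minimal Inspect log; only the fields combine_eval_results touches
    sample_log = {
        "eval": {"task": "inspect_evals/gaia"},
        "results": {
            "scores": [
                {
                    "name": "gaia_scorer",  # illustrative scorer name
                    "metrics": {
                        "accuracy": {"name": "accuracy", "value": 0.42},  # illustrative value
                    },
                }
            ]
        },
    }

    task_name = sample_log["eval"]["task"].split("/")[-1]  # -> "gaia"
    metrics = sample_log["results"]["scores"][0]["metrics"]
    metric = metrics["accuracy"]  # METRIC_NAME["gaia"] == "accuracy"
    print(task_name, metric["name"], metric["value"])  # gaia accuracy 0.42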
src/about.py CHANGED
@@ -32,8 +32,9 @@ class Tasks(Enum):
     task13 = Task("mmmu_open", "accuracy", "MMMU-Open-Ended", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
 
     # agentic
-    task14 = Task("gaia", "mean", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
+    task14 = Task("gaia", "accuracy", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
     task15 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf")
+    task16 = Task("gdm_in_house_ctf", "accuracy", "GDM-In-House-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf")
 
 
 NUM_FEWSHOT = 0  # Change with your few shot
@@ -59,7 +60,7 @@ LLM_BENCHMARKS_TEXT = f"""
 The **Vector State of Evaluation Leaderboard** presents the performance of selected LLM models on a variety of tasks. These tasks are divided into two categories:
 
 - **Base Tasks**: ARC-Easy, ARC-Challenge, DROP, WinoGrande, GSM8K, HellaSwag, HumanEval, IFEval, MATH, MMLU, MMLU-Pro, GPQA-Diamond, MMMU-Multiple-Choice, MMMU-Open-Ended
-- **Agentic Tasks**: GAIA, GDM-InterCode-CTF
+- **Agentic Tasks**: GAIA, GDM-InterCode-CTF, GDM-In-House-CTF
 
 Users can compare models side by side to see how they perform on both base-level understanding tasks and more advanced, “agentic” tasks.
 
@@ -93,6 +94,7 @@ Here is a closer look at each benchmark included in the leaderboard:
 ### Agentic Benchmarks
 - **GAIA**: Evaluates more autonomous or “agentic” reasoning, including planning and problem-solving.
 - **GDM-InterCode-CTF**: A capture-the-flag style challenge focusing on code interpretation and generative debugging strategies.
+- **GDM-In-House-CTF**: A capture-the-flag style challenge testing a variety of security skills pertaining to basic web application security.
 
 ---
 """
src/populate.py CHANGED
@@ -46,6 +46,9 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list):
     # subset for model and benchmark cols
     df = df[[AutoEvalColumn.model.name] + benchmark_cols]
 
+    # drop rows for which all benchmark cols are empty
+    df = df.dropna(subset=benchmark_cols, axis=0, how="all")
+
     df = df.fillna(EMPTY_SYMBOL)
 
     # make values clickable and link to log files
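
The added dropna keeps a model's row only when at least one benchmark column has a value, so models with no results at all no longer appear; the remaining gaps are then rendered as EMPTY_SYMBOL. A small self-contained sketch of that behavior (the column names and EMPTY_SYMBOL value are illustrative):

    import pandas as pd

    EMPTY_SYMBOL = "-"  # illustrative placeholder
    benchmark_cols = ["gaia", "gdm_intercode_ctf"]

    df = pd.DataFrame({
        "model": ["a", "b"],
        "gaia": [0.42, None],
        "gdm_intercode_ctf": [None, None],
    })

    # model "b" has no benchmark values at all and is dropped;
    # model "a" keeps its row, and its missing cell becomes "-"
    df = df.dropna(subset=benchmark_cols, axis=0, how="all")
    df = df.fillna(EMPTY_SYMBOL)
    print(df)
    #   model  gaia gdm_intercode_ctf
    # 0     a  0.42                 -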