xeon27 committed · Commit 2718fde
Parent(s): 1249741

Add results for GAIA and GDM tasks

Changed files:
- create_log_file_map.py +13 -8
- inspect_log_file_names.json +20 -8
- refactor_eval_results.py +69 -29
- src/about.py +4 -2
- src/populate.py +3 -0
create_log_file_map.py
CHANGED
@@ -3,11 +3,13 @@ import os
 
 from collections import defaultdict
 
+from refactor_eval_results import AGENTIC_LOG_MODEL_NAME_MAP, AGENTIC_TASKS
+
 
 def main():
 
     base_bm_input_path = "./base_benchmarking_logs"
-    agentic_bm_input_path = "
+    agentic_bm_input_path = "/fs01/projects/aieng/public/inspect_evals/agentic_benchmarking_runs"
 
     log_file_map = defaultdict()
 
@@ -20,13 +22,16 @@ def main():
                     task_name = result["eval"]["task"].split("/")[-1]
                     log_file_map[model_name][task_name] = task_log_file
 
-    for model_name in
-
-
-
-
-
-
+    for model_name in AGENTIC_LOG_MODEL_NAME_MAP.keys():
+        log_file_path = os.path.join(agentic_bm_input_path, AGENTIC_LOG_MODEL_NAME_MAP[model_name])
+        if os.path.isdir(log_file_path):
+            for task in AGENTIC_TASKS:
+                for task_log_file in os.listdir(os.path.join(log_file_path, task)):
+                    if task_log_file.endswith(".json"):
+                        with open(os.path.join(log_file_path, task, task_log_file), "r") as f:
+                            result = json.load(f)
+                            task_name = result["eval"]["task"].split("/")[-1]
+                            log_file_map[model_name][task_name] = task_log_file
 
     with open("./inspect_log_file_names.json", "w") as f:
         json.dump(log_file_map, f, indent=4)
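For context, a minimal sketch of reading the generated map back (the lookup below uses the gemini-1.5-pro GAIA entry added in this commit; the script itself only writes the file):

    import json

    with open("./inspect_log_file_names.json", "r") as f:
        log_file_map = json.load(f)

    # Resolve the log file recorded for one model/task pair.
    print(log_file_map["gemini-1.5-pro"]["gaia"])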
inspect_log_file_names.json
CHANGED
@@ -2,9 +2,11 @@
     "gemini-1.5-pro": {
         "mmlu": "2024-11-04T16-56-26-05-00_mmlu_Z9KrcK7x4ZLAR5nJ9JaVUe.json",
         "humaneval": "2024-11-04T12-43-07-05-00_humaneval_5JBjtymGtK23qwVKxqidhV.json",
+        "mmmu_multiple_choice": "2025-01-20T23-16-04-05-00_mmmu-multiple-choice_NLmxmHYt6CJymRVVa5UsbD.json",
         "mmlu_pro": "2024-11-04T20-13-09-05-00_mmlu-pro_Hv2ujvKLV6H7ZwQu2q8LNw.json",
         "math": "2024-11-04T15-48-46-05-00_math_9DAZmGEfhpa3nUcmMAwqZe.json",
         "arc_easy": "2024-11-04T12-31-43-05-00_arc-easy_eGxYWywpLuREcaCKvHa8Uk.json",
+        "mmmu_open": "2025-01-20T23-19-25-05-00_mmmu-open_CDbtEQ7tjs5zkj4ScBbzod.json",
         "gsm8k": "2024-11-04T15-15-26-05-00_gsm8k_cTebw3ugfrVz3dyPwxtdUZ.json",
         "gpqa_diamond": "2024-11-05T09-56-31-05-00_gpqa-diamond_FBq2bnoyGYQ3NF96xQw8iy.json",
         "ifeval": "2024-11-04T12-43-32-05-00_ifeval_mSwZ7AwA7akj5PjZbQMjgC.json",
@@ -12,13 +14,15 @@
         "arc_challenge": "2024-11-04T12-37-36-05-00_arc-challenge_5VVApyQD22QpJoMm53EMdU.json",
         "drop": "2024-11-04T12-44-32-05-00_drop_9dzPKVJojSVsxmiBFnej2m.json",
         "hellaswag": "2024-11-05T13-14-31-05-00_hellaswag_N98eeftuY2pucRtgpUYk5m.json",
-        "gaia": "
-        "gdm_intercode_ctf": "
+        "gaia": "2025-01-21T15-33-29-05-00_gemini-1.5-pro_gaia_merged.json",
+        "gdm_intercode_ctf": "2025-01-21T23-59-58+00-00_gemini-1.5-pro_gdm-intercode-ctf_merged.json",
+        "gdm_in_house_ctf": "2025-01-22T03-42-16+00-00_gemini-1.5-pro_gdm-in-house-ctf.json"
     },
     "gemini-1.5-flash": {
         "gpqa_diamond": "2024-11-04T12-47-34-05-00_gpqa-diamond_cL5kQj8DWbRfxz79piTSdy.json",
         "arc_challenge": "2024-11-04T12-45-59-05-00_arc-challenge_YQLMHfEXqeYgGJY86EB9bp.json",
         "math": "2024-11-04T15-25-38-05-00_math_eaYBRMFgo8p6VUUCYxnCWj.json",
+        "mmmu_open": "2025-01-20T23-23-50-05-00_mmmu-open_L7CnETP7d49axc7L8ChEZ4.json",
         "drop": "2024-11-04T12-52-08-05-00_drop_5i253AQzbENgHTYN4ATemV.json",
         "mmlu_pro": "2024-11-04T19-44-13-05-00_mmlu-pro_8GrR6wUsYNkthiZNMmLa8y.json",
         "ifeval": "2024-11-04T12-51-30-05-00_ifeval_ZATErMbLHoyxh4kDaSqy8j.json",
@@ -28,20 +32,22 @@
         "arc_easy": "2024-11-04T12-39-50-05-00_arc-easy_NwmTEw6C8VSCXzzwZCFy48.json",
         "gsm8k": "2024-11-04T15-22-21-05-00_gsm8k_hdJs3Z6XzpR5netTcWLXJT.json",
         "mmlu": "2024-11-04T16-26-13-05-00_mmlu_QvfQ46qJen2bvxiktHu86H.json",
-        "
+        "mmmu_multiple_choice": "2025-01-20T23-21-33-05-00_mmmu-multiple-choice_3huWbH3SVWx7NTGwYoKbBD.json"
     },
     "o1": {
         "winogrande": "2025-01-20T16-46-06-05-00_winogrande_YUtAdEsForRffqe4Sm3wtR.json",
         "humaneval": "2025-01-17T14-59-12-05-00_humaneval_RRL8GMy9NakTxUHsDVWNng.json",
         "mmmu_open": "2025-01-20T22-48-09-05-00_mmmu-open_oBzxJBYbvnktbbAwhoCrYK.json",
-        "mmlu_pro": "2025-01-20T14-02-37-05-00_mmlu-pro_EvDzvqaahQwhv6fJovN4BT.json",
         "math": "2025-01-17T15-03-22-05-00_math_6BbvHFF8hLMsVYozyNLbyQ.json",
         "arc_easy": "2025-01-17T11-29-26-05-00_arc-easy_DFbir4BdgQDbKd52r7tRKR.json",
         "arc_challenge": "2025-01-17T11-44-42-05-00_arc-challenge_PsWXaBqrgv3EcTZC55gRzJ.json",
         "gsm8k": "2025-01-17T12-56-38-05-00_gsm8k_iD8275qeyNTgX523pn45bF.json",
         "gpqa_diamond": "2025-01-17T11-53-53-05-00_gpqa-diamond_EJV7ULFSQLRoFTEqsv3t6q.json",
-        "
-        "mmmu_multiple_choice": "2025-01-20T21-04-57-05-00_mmmu-multiple-choice_MctxjookaeTLCL8KpUeazT.json"
+        "mmlu_pro": "2025-01-20T14-02-37-05-00_mmlu-pro_EvDzvqaahQwhv6fJovN4BT.json",
+        "mmmu_multiple_choice": "2025-01-20T21-04-57-05-00_mmmu-multiple-choice_MctxjookaeTLCL8KpUeazT.json",
+        "gaia": "2025-01-22T13-42-00-05-00_o1_gaia_merged.json",
+        "gdm_intercode_ctf": "2025-01-22T20-46-35+00-00_o1_gdm-intercode-ctf_merged.json",
+        "gdm_in_house_ctf": "2025-01-22T05-52-25+00-00_o1_gdm-in-house-ctf.json"
     },
     "claude-3-5-sonnet-20241022": {
         "mmmu_multiple_choice": "2025-01-21T11-20-03-05-00_mmmu-multiple-choice_CWhKvGdoFo6pdHhDyi9GNm.json",
@@ -57,7 +63,10 @@
         "ifeval": "2025-01-16T11-28-44-05-00_ifeval_fmWxch4ZjbmYCST6yUZsdV.json",
         "humaneval": "2025-01-16T11-26-12-05-00_humaneval_kUASiaNd9uZfWvCwYHhdF5.json",
         "winogrande": "2025-01-16T22-09-41-05-00_winogrande_mSWGAKg75E5RP79KWizvb9.json",
-        "drop": "2025-01-15T10-15-15-05-00_drop_Z9A2Y84HYponNxnzNT9TNq.json"
+        "drop": "2025-01-15T10-15-15-05-00_drop_Z9A2Y84HYponNxnzNT9TNq.json",
+        "gaia": "2025-01-12T23-57-37-05-00_claude-3-5-sonnet_gaia_merged.json",
+        "gdm_intercode_ctf": "2025-01-11T02-47-45-05-00_claude-3-5-sonnet_gdm-intercode-ctf_merged.json",
+        "gdm_in_house_ctf": "2025-01-11T07-41-14+00-00_claude-3-5-sonnet_gdm-in-house-ctf.json"
     },
     "c4ai-command-r-plus": {
         "ifeval": "2024-10-30T17-23-04-04-00_ifeval_RGucUMwdGmUnRpqyMTZTzW.json",
@@ -117,7 +126,10 @@
         "mmmu_multiple_choice": "2025-01-20T23-03-21-05-00_mmmu-multiple-choice_eoycAFLMirSqiURdXmBP2e.json",
         "humaneval": "2024-10-31T04-59-42-04-00_humaneval_nmJcd84CcNKjWS8fBfMbZM.json",
         "math": "2024-10-31T05-01-22-04-00_math_cDSpKPp3nLrFy8uYfYKEbM.json",
-        "hellaswag": "2024-10-31T03-33-47-04-00_hellaswag_JNnnPuz3dhZRpyXzizMUBF.json"
+        "hellaswag": "2024-10-31T03-33-47-04-00_hellaswag_JNnnPuz3dhZRpyXzizMUBF.json",
+        "gaia": "2025-01-13T15-53-22+00-00_gpt-4o_gaia_merged.json",
+        "gdm_intercode_ctf": "2025-01-08T10-06-29-05-00_gpt-4o_gdm-intercode-ctf_merged.json",
+        "gdm_in_house_ctf": "2025-01-11T07-02-14+00-00_gpt-4o_gdm-in-house-ctf.json"
     },
     "Mistral-Large-Instruct-2407": {
         "drop": "2024-10-31T01-56-12-04-00_drop_NtvuCoU2LoMbH8DztcCTen.json",
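A hypothetical sanity check, not part of this commit, that flags models still missing a log file for any task tracked in METRIC_NAME from refactor_eval_results.py:

    import json

    from refactor_eval_results import METRIC_NAME

    with open("./inspect_log_file_names.json", "r") as f:
        log_file_map = json.load(f)

    for model, tasks in log_file_map.items():
        missing = sorted(set(METRIC_NAME) - set(tasks))
        if missing:
            print(f"{model}: no log file for {missing}")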
refactor_eval_results.py
CHANGED
@@ -20,8 +20,9 @@ METRIC_NAME = {
     "mmmu_open": "accuracy",
 
     # agentic
-    "gaia": "
+    "gaia": "accuracy",
     "gdm_intercode_ctf": "accuracy",
+    "gdm_in_house_ctf": "accuracy",
 }
 
 MODEL_SHA_MAP = {
@@ -40,8 +41,17 @@ MODEL_SHA_MAP = {
     "o1": "https://openai.com/o1",
 }
 
+AGENTIC_LOG_MODEL_NAME_MAP = {
+    "claude-3-5-sonnet-20241022": "claude-3-5-sonnet-20241022",
+    "gemini-1.5-pro": "gemini-1.5-pro-002",
+    "gpt-4o": "gpt-4o-2024-08-06",
+    "o1": "o1-2024-12-17",
+}
+
+AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf"]
 
-def combine_eval_results(results_path: str, model_name: str) -> dict:
+
+def combine_eval_results(results_path: str, model_name: str, type: str) -> dict:
     results = dict(
         {
             "config": {
@@ -53,29 +63,57 @@ def combine_eval_results(results_path: str, model_name: str) -> dict:
             "results": {},
        }
     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    if type == "base":
+        for file in os.listdir(os.path.join(results_path, model_name)):
+            if file.endswith(".json"):
+                with open(os.path.join(results_path, model_name, file), "r") as f:
+                    try:
+                        result = json.load(f)
+                        task_name = result["eval"]["task"].split("/")[-1]
+                        if task_name == "math":
+                            metrics = [elm for elm in result["results"]["scores"] if elm["name"] == "expression_equivalance"][0]["metrics"]  # TODO: change scorer if required
+                        else:
+                            metrics = result["results"]["scores"][0]["metrics"]
+                        metric_name = metrics[METRIC_NAME[task_name]]["name"]
+                        metric_value = metrics[METRIC_NAME[task_name]]["value"]
+                        results["results"].update(
+                            {
+                                task_name: {
+                                    metric_name: metric_value
+                                }
                             }
-
-
-
-
-
-
+                        )
+                    except KeyError as e:
+                        print(f"KeyError: {e}")
+                        print(model_name)
+                        print(file)
+
+    elif type == "agentic":
+        model_name = AGENTIC_LOG_MODEL_NAME_MAP[model_name]  # change name based on log file structure
+        results_path = os.path.join(results_path, model_name)
+        for task in AGENTIC_TASKS:
+            for file in os.listdir(os.path.join(results_path, task)):
+                if file.endswith(".json"):
+                    with open(os.path.join(results_path, task, file), "r") as f:
+                        try:
+                            result = json.load(f)
+                            task_name = result["eval"]["task"].split("/")[-1]
+                            metrics = result["results"]["scores"][0]["metrics"]
+                            metric_name = metrics[METRIC_NAME[task_name]]["name"]
+                            metric_value = metrics[METRIC_NAME[task_name]]["value"]
+                            results["results"].update(
+                                {
+                                    task_name: {
+                                        metric_name: metric_value
+                                    }
+                                }
+                            )
+                        except KeyError as e:
+                            print(f"KeyError: {e}")
+                            print(model_name)
+                            print(file)
+
     return results
 
 
@@ -86,19 +124,21 @@ def main():
     EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 
     base_bm_input_path = "./base_benchmarking_logs"
-    agentic_bm_input_path = "
+    agentic_bm_input_path = "/fs01/projects/aieng/public/inspect_evals/agentic_benchmarking_runs"
     os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
     os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)
 
     for model_name in os.listdir(base_bm_input_path):
+
         if os.path.isdir(os.path.join(base_bm_input_path, model_name)):
-            results = combine_eval_results(base_bm_input_path, model_name)
-            # TMP: Add
+            results = combine_eval_results(base_bm_input_path, model_name, "base")
+            # TMP: Add missing benchmarks to the results
             for metric in METRIC_NAME.items():
                 if metric[0] not in results["results"]:
                     results["results"].update({metric[0]: {metric[1]: None}})
-
-
+
+            if os.path.isdir(os.path.join(agentic_bm_input_path, AGENTIC_LOG_MODEL_NAME_MAP.get(model_name, "NA"))):
+                agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name, "agentic")
                 results["results"].update(agentic_bm_results["results"])
             with open(os.path.join(EVAL_RESULTS_PATH, f"{model_name}.json"), "w") as f:
                 json.dump(results, f, indent=4)
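Both new branches share one extraction pattern; the sketch below condenses it, assuming the Inspect JSON log layout the code above relies on (the helper name extract_metric is illustrative, not part of the commit):

    import json

    def extract_metric(log_path: str, metric_key: str) -> tuple:
        """Return (task_name, metric_name, metric_value) from one Inspect log."""
        with open(log_path, "r") as f:
            result = json.load(f)
        task_name = result["eval"]["task"].split("/")[-1]
        metrics = result["results"]["scores"][0]["metrics"]
        return task_name, metrics[metric_key]["name"], metrics[metric_key]["value"]

The base branch departs from this only for math, where it selects the "expression_equivalance" scorer rather than scores[0].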
src/about.py
CHANGED
@@ -32,8 +32,9 @@ class Tasks(Enum):
     task13 = Task("mmmu_open", "accuracy", "MMMU-Open-Ended", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
 
     # agentic
-    task14 = Task("gaia", "
+    task14 = Task("gaia", "accuracy", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
     task15 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf")
+    task16 = Task("gdm_in_house_ctf", "accuracy", "GDM-In-House-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf")
 
 
 NUM_FEWSHOT = 0  # Change with your few shot
@@ -59,7 +60,7 @@ LLM_BENCHMARKS_TEXT = f"""
 The **Vector State of Evaluation Leaderboard** presents the performance of selected LLM models on a variety of tasks. These tasks are divided into two categories:
 
 - **Base Tasks**: ARC-Easy, ARC-Challenge, DROP, WinoGrande, GSM8K, HellaSwag, HumanEval, IFEval, MATH, MMLU, MMLU-Pro, GPQA-Diamond, MMMU-Multiple-Choice, MMMU-Open-Ended
-- **Agentic Tasks**: GAIA, GDM-InterCode-CTF
+- **Agentic Tasks**: GAIA, GDM-InterCode-CTF, GDM-In-House-CTF
 
 Users can compare models side by side to see how they perform on both base-level understanding tasks and more advanced, “agentic” tasks.
 
@@ -93,6 +94,7 @@ Here is a closer look at each benchmark included in the leaderboard:
 ### Agentic Benchmarks
 - **GAIA**: Evaluates more autonomous or “agentic” reasoning, including planning and problem-solving.
 - **GDM-InterCode-CTF**: A capture-the-flag style challenge focusing on code interpretation and generative debugging strategies.
+- **GDM-In-House-CTF**: A capture-the-flag style challenge testing a variety of security skills pertaining to basic web application security.
 
 ---
 """
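For reference, these Enum entries assume a five-field Task container. A sketch of its likely shape (the real definition lives elsewhere in src/about.py; the field names below are guesses from the call sites above):

    from dataclasses import dataclass

    @dataclass
    class Task:
        benchmark: str   # results key, e.g. "gdm_in_house_ctf"
        metric: str      # metric name, e.g. "accuracy"
        col_name: str    # leaderboard column label, e.g. "GDM-In-House-CTF"
        category: str    # "base" or "agentic"
        source_url: str  # link to the inspect_evals implementation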
src/populate.py
CHANGED
@@ -46,6 +46,9 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     # subset for model and benchmark cols
     df = df[[AutoEvalColumn.model.name] + benchmark_cols]
 
+    # drop rows for which all benchmark cols are empty
+    df = df.dropna(subset=benchmark_cols, axis=0, how="all")
+
     df = df.fillna(EMPTY_SYMBOL)
 
     # make values clickable and link to log files
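A small illustration of the new dropna call (toy data; the fillna(EMPTY_SYMBOL) that follows then only touches partially filled rows):

    import pandas as pd

    df = pd.DataFrame({
        "model": ["a", "b"],
        "gaia": [0.3, None],
        "gdm_intercode_ctf": [0.5, None],
    })

    # Row "b" has no benchmark values at all, so it is dropped entirely;
    # row "a" is kept even though some benchmark columns could be NaN.
    df = df.dropna(subset=["gaia", "gdm_intercode_ctf"], axis=0, how="all")
    print(df)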