Commit 1289818
Author: xeon27
Parent(s): d5a5b95

Add agentharm and swe-bench tasks

Files changed:
- inspect_log_file_names.json (+16 -4)
- refactor_eval_results.py (+5 -2)
- src/about.py (+6 -1)
inspect_log_file_names.json
@@ -16,7 +16,10 @@
         "hellaswag": "2024-11-05T13-14-31-05-00_hellaswag_N98eeftuY2pucRtgpUYk5m.json",
         "gaia": "2025-01-21T15-33-29-05-00_gemini-1.5-pro_gaia_merged.json",
         "gdm_intercode_ctf": "2025-01-21T23-59-58+00-00_gemini-1.5-pro_gdm-intercode-ctf_merged.json",
-        "gdm_in_house_ctf": "2025-01-22T03-42-16+00-00_gemini-1.5-pro_gdm-in-house-ctf.json"
+        "gdm_in_house_ctf": "2025-01-22T03-42-16+00-00_gemini-1.5-pro_gdm-in-house-ctf.json",
+        "agentharm_benign": "2025-01-21T13-18-51-08-00_agentharm-benign_gP3pQPxAuCtFLiHzt2Egt7.json",
+        "agentharm": "2025-01-21T12-45-43-08-00_agentharm_VmD26soLwmRgWPo3hpRHBr.json",
+        "swe_bench": "2025-01-22T03-00-08+00-00_google-gemini-1.5-pro.json"
     },
     "gemini-1.5-flash": {
         "gpqa_diamond": "2024-11-04T12-47-34-05-00_gpqa-diamond_cL5kQj8DWbRfxz79piTSdy.json",
@@ -47,7 +50,10 @@
         "mmmu_multiple_choice": "2025-01-20T21-04-57-05-00_mmmu-multiple-choice_MctxjookaeTLCL8KpUeazT.json",
         "gaia": "2025-01-22T13-42-00-05-00_o1_gaia_merged.json",
         "gdm_intercode_ctf": "2025-01-22T20-46-35+00-00_o1_gdm-intercode-ctf_merged.json",
-        "gdm_in_house_ctf": "2025-01-22T05-52-25+00-00_o1_gdm-in-house-ctf.json"
+        "gdm_in_house_ctf": "2025-01-22T05-52-25+00-00_o1_gdm-in-house-ctf.json",
+        "agentharm": "2025-01-21T09-05-42-08-00_agentharm_UGDq2yJeLAnPH6p7FgDgD8.json",
+        "agentharm_benign": "2025-01-21T18-20-15-08-00_agentharm-benign_bkW2Bf5xLyDQdNtfLdjCpJ.json",
+        "swe_bench": "2025-01-21T17-42-11+00-00_openai-o1.json"
     },
     "claude-3-5-sonnet-20241022": {
         "mmmu_multiple_choice": "2025-01-21T11-20-03-05-00_mmmu-multiple-choice_CWhKvGdoFo6pdHhDyi9GNm.json",
@@ -66,7 +72,10 @@
         "drop": "2025-01-15T10-15-15-05-00_drop_Z9A2Y84HYponNxnzNT9TNq.json",
         "gaia": "2025-01-12T23-57-37-05-00_claude-3-5-sonnet_gaia_merged.json",
         "gdm_intercode_ctf": "2025-01-11T02-47-45-05-00_claude-3-5-sonnet_gdm-intercode-ctf_merged.json",
-        "gdm_in_house_ctf": "2025-01-11T07-41-14+00-00_claude-3-5-sonnet_gdm-in-house-ctf.json"
+        "gdm_in_house_ctf": "2025-01-11T07-41-14+00-00_claude-3-5-sonnet_gdm-in-house-ctf.json",
+        "agentharm_benign": "2025-01-21T15-09-48-08-00_agentharm-benign_A3uBBWNvv88P5BsgqwFCfg.json",
+        "agentharm": "2025-01-15T08-05-14-08-00_agentharm_VJGhWKLrVLdQczBZVgCXHc.json",
+        "swe_bench": "2025-01-16T18-56-55+00-00_anthropic-claude-3-5-sonnet.json"
     },
     "c4ai-command-r-plus": {
         "ifeval": "2024-10-30T17-23-04-04-00_ifeval_RGucUMwdGmUnRpqyMTZTzW.json",
@@ -129,7 +138,10 @@
         "hellaswag": "2024-10-31T03-33-47-04-00_hellaswag_JNnnPuz3dhZRpyXzizMUBF.json",
         "gaia": "2025-01-13T15-53-22+00-00_gpt-4o_gaia_merged.json",
         "gdm_intercode_ctf": "2025-01-08T10-06-29-05-00_gpt-4o_gdm-intercode-ctf_merged.json",
-        "gdm_in_house_ctf": "2025-01-11T07-02-14+00-00_gpt-4o_gdm-in-house-ctf.json"
+        "gdm_in_house_ctf": "2025-01-11T07-02-14+00-00_gpt-4o_gdm-in-house-ctf.json",
+        "agentharm": "2025-01-07T16-34-15-08-00_agentharm_UfSoyHEAH2E5RVdrPVUemy.json",
+        "agentharm_benign": "2025-01-21T13-45-18-08-00_agentharm-benign_8DhGJqEAvw6o8uCv4a4dVz.json",
+        "swe_bench": "2025-01-14T23-09-10+00-00_openai-gpt-4o.json"
     },
     "Mistral-Large-Instruct-2407": {
         "drop": "2024-10-31T01-56-12-04-00_drop_NtvuCoU2LoMbH8DztcCTen.json",
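This JSON maps each model to one inspect log file per task, keyed by task name. A minimal sketch of how a loader could resolve and read one of these logs; the logs_dir layout and the helper name are assumptions for illustration, not part of this repository:

import json
import os


def load_inspect_log(logs_dir: str, model_name: str, task_name: str) -> dict:
    """Resolve the log file recorded for (model, task) and load it."""
    with open(os.path.join(logs_dir, "inspect_log_file_names.json")) as f:
        log_file_names = json.load(f)
    # e.g. log_file_names["gemini-1.5-pro"]["swe_bench"]
    #      -> "2025-01-22T03-00-08+00-00_google-gemini-1.5-pro.json"
    file_name = log_file_names[model_name][task_name]
    with open(os.path.join(logs_dir, file_name)) as f:
        return json.load(f)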
refactor_eval_results.py
@@ -23,6 +23,9 @@ METRIC_NAME = {
     "gaia": "accuracy",
     "gdm_intercode_ctf": "accuracy",
     "gdm_in_house_ctf": "accuracy",
+    "agentharm": "avg_score",
+    "agentharm_benign": "avg_score",
+    "swe_bench": "mean",
 }
 
 MODEL_SHA_MAP = {
@@ -48,7 +51,7 @@ AGENTIC_LOG_MODEL_NAME_MAP = {
     "o1": "o1-2024-12-17",
 }
 
-AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf"]
+AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf", "agentharm", "swe-bench"]
 
 
 def combine_eval_results(results_path: str, model_name: str, type: str,) -> dict:
@@ -100,7 +103,7 @@ def combine_eval_results(results_path: str, model_name: str, type: str,) -> dict
             result = json.load(f)
             task_name = result["eval"]["task"].split("/")[-1]
             metrics = result["results"]["scores"][0]["metrics"]
-            metric_name = metrics[METRIC_NAME[task_name]]["name"]
+            metric_name = metrics[METRIC_NAME[task_name]]["name"].split("/")[-1]
             metric_value = metrics[METRIC_NAME[task_name]]["value"]
             results["results"].update(
                 {
src/about.py
@@ -35,6 +35,9 @@ class Tasks(Enum):
     task14 = Task("gaia", "accuracy", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
     task15 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf")
     task16 = Task("gdm_in_house_ctf", "accuracy", "GDM-In-House-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf")
+    task17 = Task("agentharm", "avg_score", "AgentHarm", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm")
+    task18 = Task("agentharm_benign", "avg_score", "AgentHarm-Benign", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm")
+    task19 = Task("swe_bench", "mean", "SWE-Bench", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench")
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
@@ -60,7 +63,7 @@ LLM_BENCHMARKS_TEXT = f"""
 The **Vector State of Evaluation Leaderboard** presents the performance of selected LLM models on a variety of tasks. These tasks are divided into two categories:
 
 - **Base Tasks**: ARC-Easy, ARC-Challenge, DROP, WinoGrande, GSM8K, HellaSwag, HumanEval, IFEval, MATH, MMLU, MMLU-Pro, GPQA-Diamond, MMMU-Multiple-Choice, MMMU-Open-Ended
-- **Agentic Tasks**: GAIA, GDM-InterCode-CTF, GDM-In-House-CTF
+- **Agentic Tasks**: GAIA, GDM-InterCode-CTF, GDM-In-House-CTF, AgentHarm, AgentHarm-Benign, SWE-Bench
 
 Users can compare models side by side to see how they perform on both base-level understanding tasks and more advanced, “agentic” tasks.
 
@@ -95,6 +98,8 @@ Here is a closer look at each benchmark included in the leaderboard:
 - **GAIA**: Evaluates more autonomous or “agentic” reasoning, including planning and problem-solving.
 - **GDM-InterCode-CTF**: A capture-the-flag style challenge focusing on code interpretation and generative debugging strategies.
 - **GDM-In-House-CTF**: A capture-the-flag style challenge testing a variety of security skills pertaining to basic web application security.
+- **AgentHarm / AgentHarm-Benign**: A benchmark for measuring harmfulness of LLM agents.
+- **SWE-Bench**: A benchmark for testing the ability of AI agents to solve software engineering tasks.
 
 ---
 """
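The three new enum entries reuse the existing five-argument Task signature. For orientation, a sketch of a compatible Task dataclass; the first three field names follow the standard Hugging Face leaderboard template, while the last two are inferred from how the arguments are used here, so all names are assumptions rather than this repository's definition:

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str   # key into the results file, e.g. "swe_bench"
    metric: str      # metric to report, e.g. "mean"
    col_name: str    # leaderboard column header, e.g. "SWE-Bench"
    category: str    # task grouping; "agentic" for all three new entries
    link: str        # URL of the inspect_evals implementation


class Tasks(Enum):
    task19 = Task(
        "swe_bench", "mean", "SWE-Bench", "agentic",
        "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench",
    )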