xeon27
committed on
Commit
·
5458f38
1
Parent(s):
1f6d554
Add o3-mini and DeepSeek-R1 results
Browse files
- inspect_log_file_names.json +25 -0
- refactor_eval_results.py +5 -0
inspect_log_file_names.json
CHANGED
@@ -37,6 +37,31 @@
|
|
37 |
"mmlu": "2024-11-04T16-26-13-05-00_mmlu_QvfQ46qJen2bvxiktHu86H.json",
|
38 |
"mmmu_multiple_choice": "2025-01-20T23-21-33-05-00_mmmu-multiple-choice_3huWbH3SVWx7NTGwYoKbBD.json"
|
39 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
"o1": {
|
41 |
"winogrande": "2025-01-20T16-46-06-05-00_winogrande_YUtAdEsForRffqe4Sm3wtR.json",
|
42 |
"humaneval": "2025-01-17T14-59-12-05-00_humaneval_RRL8GMy9NakTxUHsDVWNng.json",
|
|
|
37 |
"mmlu": "2024-11-04T16-26-13-05-00_mmlu_QvfQ46qJen2bvxiktHu86H.json",
|
38 |
"mmmu_multiple_choice": "2025-01-20T23-21-33-05-00_mmmu-multiple-choice_3huWbH3SVWx7NTGwYoKbBD.json"
|
39 |
},
|
40 |
+
"o3-mini": {
|
41 |
+
"math": "2025-02-06T18-33-30-05-00_math_86Gx8n4BxhpyfaSHmRcCUm.json",
|
42 |
+
"humaneval": "2025-02-06T20-58-48-05-00_humaneval_Dkod7CS9RmbbogYx9aEXtx.json",
|
43 |
+
"mmlu_pro": "2025-02-06T19-49-27-05-00_mmlu-pro_jz9woKfdKt8VMzqNFsy7kY.json",
|
44 |
+
"gpqa_diamond": "2025-02-06T17-57-54-05-00_gpqa-diamond_2znyMtdc7X4LJufxXeXA8Z.json",
|
45 |
+
"winogrande": "2025-02-06T22-50-40-05-00_winogrande_VsTW2uU2Kj66YoNoFfRfUj.json",
|
46 |
+
"gsm8k": "2025-02-06T18-23-05-05-00_gsm8k_d523pJzkcvobxamhhobCRb.json",
|
47 |
+
"arc_challenge": "2025-02-06T17-53-30-05-00_arc-challenge_AYFHec7wmd4jELF2Rgzfya.json",
|
48 |
+
"arc_easy": "2025-02-06T17-45-57-05-00_arc-easy_Nd8NP3K48tvwLVZb8kXDwg.json",
|
49 |
+
"gaia": "2025-02-05T23-21-20+00-00_gaia_hyMq8MzMm6NgAeq3dNqZSU.json",
|
50 |
+
"gdm_intercode_ctf": "2025-02-05T21-43-18+00-00_gdm-intercode-ctf_gdm29C6DuTEsX9qm9ymmrC.json",
|
51 |
+
"gdm_in_house_ctf": "2025-02-05T23-59-08+00-00_gdm-in-house-ctf_2zkAX5nkJoxDnVKpJL9VgW.json",
|
52 |
+
"agentharm_benign": "2025-02-03T18-49-08-08-00_agentharm-benign_Gv94YFpAXaaCJqe3Fc6yr3.json",
|
53 |
+
"agentharm": "2025-02-03T18-17-03-08-00_agentharm_DmN6i5HrgXHNARjsuSewjg.json",
|
54 |
+
"swe_bench": "2025-02-03T06-49-09+00-00_openai-o3-mini.json"
|
55 |
+
},
|
56 |
+
"DeepSeek-R1": {
|
57 |
+
"mmlu_pro": "2025-02-12T11-02-35-05-00_mmlu-pro_BhD89DYN9KM3k4weSDfaQK.json",
|
58 |
+
"math": "2025-02-11T11-38-10-05-00_math_ZYFSqsWsmP5kLRLHEMWULU.json",
|
59 |
+
"gsm8k": "2025-02-02T16-28-05-05-00_gsm8k_YMw6WiZkgTBQ54z5UHtDDX.json",
|
60 |
+
"arc_challenge": "2025-01-30T15-42-39-05-00_arc-challenge_CviW9ro6rKBbctkwJzQstp.json",
|
61 |
+
"winogrande": "2025-02-04T00-25-12-05-00_winogrande_NPgTbtqom2QSPKxeThWrdZ.json",
|
62 |
+
"arc_easy": "2025-01-30T12-48-35-05-00_arc-easy_SvRDfqsHDECQtvNU7rodZH.json",
|
63 |
+
"gpqa_diamond": "2025-02-11T11-37-45-05-00_gpqa-diamond_MwnVeLwyuiEAALr3M5q3dn.json"
|
64 |
+
},
|
65 |
"o1": {
|
66 |
"winogrande": "2025-01-20T16-46-06-05-00_winogrande_YUtAdEsForRffqe4Sm3wtR.json",
|
67 |
"humaneval": "2025-01-17T14-59-12-05-00_humaneval_RRL8GMy9NakTxUHsDVWNng.json",
|
refactor_eval_results.py
CHANGED
@@ -42,6 +42,8 @@ MODEL_SHA_MAP = {
|
|
42 |
"gpt-4o": "https://openai.com/index/hello-gpt-4o",
|
43 |
"gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
|
44 |
"o1": "https://openai.com/o1",
|
|
|
|
|
45 |
}
|
46 |
|
47 |
MODEL_VERSION_MAP = {
|
@@ -58,6 +60,8 @@ MODEL_VERSION_MAP = {
|
|
58 |
"gpt-4o": "GPT-4o-20240806",
|
59 |
"gpt-4o-mini": "GPT-4o-mini-20240718",
|
60 |
"o1": "o1-20241217",
|
|
|
|
|
61 |
}
|
62 |
|
63 |
AGENTIC_LOG_MODEL_NAME_MAP = {
|
@@ -65,6 +69,7 @@ AGENTIC_LOG_MODEL_NAME_MAP = {
|
|
65 |
"gemini-1.5-pro": "gemini-1.5-pro-002",
|
66 |
"gpt-4o": "gpt-4o-2024-08-06",
|
67 |
"o1": "o1-2024-12-17",
|
|
|
68 |
}
|
69 |
|
70 |
AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf", "agentharm", "swe-bench"]
|
|
|
42 |
"gpt-4o": "https://openai.com/index/hello-gpt-4o",
|
43 |
"gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
|
44 |
"o1": "https://openai.com/o1",
|
45 |
+
"o3-mini": "https://openai.com/index/openai-o3-mini",
|
46 |
+
"DeepSeek-R1": "https://api-docs.deepseek.com/news/news250120"
|
47 |
}
|
48 |
|
49 |
MODEL_VERSION_MAP = {
|
|
|
60 |
"gpt-4o": "GPT-4o-20240806",
|
61 |
"gpt-4o-mini": "GPT-4o-mini-20240718",
|
62 |
"o1": "o1-20241217",
|
63 |
+
"o3-mini": "o3-mini-20250131",
|
64 |
+
"DeepSeek-R1": "DeepSeek-R1",
|
65 |
}
|
66 |
|
67 |
AGENTIC_LOG_MODEL_NAME_MAP = {
|
|
|
69 |
"gemini-1.5-pro": "gemini-1.5-pro-002",
|
70 |
"gpt-4o": "gpt-4o-2024-08-06",
|
71 |
"o1": "o1-2024-12-17",
|
72 |
+
"o3-mini": "o3-mini-2025-01-31",
|
73 |
}
|
74 |
|
75 |
AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf", "agentharm", "swe-bench"]
|