xeon27 committed on
Commit
5458f38
·
1 Parent(s): 1f6d554

Add o3-mini and DeepSeek-R1 results

Browse files
inspect_log_file_names.json CHANGED
@@ -37,6 +37,31 @@
37
  "mmlu": "2024-11-04T16-26-13-05-00_mmlu_QvfQ46qJen2bvxiktHu86H.json",
38
  "mmmu_multiple_choice": "2025-01-20T23-21-33-05-00_mmmu-multiple-choice_3huWbH3SVWx7NTGwYoKbBD.json"
39
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  "o1": {
41
  "winogrande": "2025-01-20T16-46-06-05-00_winogrande_YUtAdEsForRffqe4Sm3wtR.json",
42
  "humaneval": "2025-01-17T14-59-12-05-00_humaneval_RRL8GMy9NakTxUHsDVWNng.json",
 
37
  "mmlu": "2024-11-04T16-26-13-05-00_mmlu_QvfQ46qJen2bvxiktHu86H.json",
38
  "mmmu_multiple_choice": "2025-01-20T23-21-33-05-00_mmmu-multiple-choice_3huWbH3SVWx7NTGwYoKbBD.json"
39
  },
40
+ "o3-mini": {
41
+ "math": "2025-02-06T18-33-30-05-00_math_86Gx8n4BxhpyfaSHmRcCUm.json",
42
+ "humaneval": "2025-02-06T20-58-48-05-00_humaneval_Dkod7CS9RmbbogYx9aEXtx.json",
43
+ "mmlu_pro": "2025-02-06T19-49-27-05-00_mmlu-pro_jz9woKfdKt8VMzqNFsy7kY.json",
44
+ "gpqa_diamond": "2025-02-06T17-57-54-05-00_gpqa-diamond_2znyMtdc7X4LJufxXeXA8Z.json",
45
+ "winogrande": "2025-02-06T22-50-40-05-00_winogrande_VsTW2uU2Kj66YoNoFfRfUj.json",
46
+ "gsm8k": "2025-02-06T18-23-05-05-00_gsm8k_d523pJzkcvobxamhhobCRb.json",
47
+ "arc_challenge": "2025-02-06T17-53-30-05-00_arc-challenge_AYFHec7wmd4jELF2Rgzfya.json",
48
+ "arc_easy": "2025-02-06T17-45-57-05-00_arc-easy_Nd8NP3K48tvwLVZb8kXDwg.json",
49
+ "gaia": "2025-02-05T23-21-20+00-00_gaia_hyMq8MzMm6NgAeq3dNqZSU.json",
50
+ "gdm_intercode_ctf": "2025-02-05T21-43-18+00-00_gdm-intercode-ctf_gdm29C6DuTEsX9qm9ymmrC.json",
51
+ "gdm_in_house_ctf": "2025-02-05T23-59-08+00-00_gdm-in-house-ctf_2zkAX5nkJoxDnVKpJL9VgW.json",
52
+ "agentharm_benign": "2025-02-03T18-49-08-08-00_agentharm-benign_Gv94YFpAXaaCJqe3Fc6yr3.json",
53
+ "agentharm": "2025-02-03T18-17-03-08-00_agentharm_DmN6i5HrgXHNARjsuSewjg.json",
54
+ "swe_bench": "2025-02-03T06-49-09+00-00_openai-o3-mini.json"
55
+ },
56
+ "DeepSeek-R1": {
57
+ "mmlu_pro": "2025-02-12T11-02-35-05-00_mmlu-pro_BhD89DYN9KM3k4weSDfaQK.json",
58
+ "math": "2025-02-11T11-38-10-05-00_math_ZYFSqsWsmP5kLRLHEMWULU.json",
59
+ "gsm8k": "2025-02-02T16-28-05-05-00_gsm8k_YMw6WiZkgTBQ54z5UHtDDX.json",
60
+ "arc_challenge": "2025-01-30T15-42-39-05-00_arc-challenge_CviW9ro6rKBbctkwJzQstp.json",
61
+ "winogrande": "2025-02-04T00-25-12-05-00_winogrande_NPgTbtqom2QSPKxeThWrdZ.json",
62
+ "arc_easy": "2025-01-30T12-48-35-05-00_arc-easy_SvRDfqsHDECQtvNU7rodZH.json",
63
+ "gpqa_diamond": "2025-02-11T11-37-45-05-00_gpqa-diamond_MwnVeLwyuiEAALr3M5q3dn.json"
64
+ },
65
  "o1": {
66
  "winogrande": "2025-01-20T16-46-06-05-00_winogrande_YUtAdEsForRffqe4Sm3wtR.json",
67
  "humaneval": "2025-01-17T14-59-12-05-00_humaneval_RRL8GMy9NakTxUHsDVWNng.json",
refactor_eval_results.py CHANGED
@@ -42,6 +42,8 @@ MODEL_SHA_MAP = {
42
  "gpt-4o": "https://openai.com/index/hello-gpt-4o",
43
  "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
44
  "o1": "https://openai.com/o1",
 
 
45
  }
46
 
47
  MODEL_VERSION_MAP = {
@@ -58,6 +60,8 @@ MODEL_VERSION_MAP = {
58
  "gpt-4o": "GPT-4o-20240806",
59
  "gpt-4o-mini": "GPT-4o-mini-20240718",
60
  "o1": "o1-20241217",
 
 
61
  }
62
 
63
  AGENTIC_LOG_MODEL_NAME_MAP = {
@@ -65,6 +69,7 @@ AGENTIC_LOG_MODEL_NAME_MAP = {
65
  "gemini-1.5-pro": "gemini-1.5-pro-002",
66
  "gpt-4o": "gpt-4o-2024-08-06",
67
  "o1": "o1-2024-12-17",
 
68
  }
69
 
70
  AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf", "agentharm", "swe-bench"]
 
42
  "gpt-4o": "https://openai.com/index/hello-gpt-4o",
43
  "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
44
  "o1": "https://openai.com/o1",
45
+ "o3-mini": "https://openai.com/index/openai-o3-mini",
46
+ "DeepSeek-R1": "https://api-docs.deepseek.com/news/news250120"
47
  }
48
 
49
  MODEL_VERSION_MAP = {
 
60
  "gpt-4o": "GPT-4o-20240806",
61
  "gpt-4o-mini": "GPT-4o-mini-20240718",
62
  "o1": "o1-20241217",
63
+ "o3-mini": "o3-mini-20250131",
64
+ "DeepSeek-R1": "DeepSeek-R1",
65
  }
66
 
67
  AGENTIC_LOG_MODEL_NAME_MAP = {
 
69
  "gemini-1.5-pro": "gemini-1.5-pro-002",
70
  "gpt-4o": "gpt-4o-2024-08-06",
71
  "o1": "o1-2024-12-17",
72
+ "o3-mini": "o3-mini-2025-01-31",
73
  }
74
 
75
  AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf", "agentharm", "swe-bench"]