Update tasks
Browse files
- data/results.json +10 -10
- data/tasks.json +3 -3
data/results.json
CHANGED
@@ -105,43 +105,43 @@
|
|
105 |
},
|
106 |
"gpqa_diamond": {
|
107 |
"accuracy": 0.4318181818181818,
|
108 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T23-41-39-04-00_gpqa-diamond_TdLdYmVM6GCVMAECcXkuhj.eval"
|
109 |
},
|
110 |
"winogrande": {
|
111 |
"accuracy": 0.8666140489344909,
|
112 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T09-20-56-04-00_winogrande_WnUgkSRhSMvh3zUjnuJWQZ.eval"
|
113 |
},
|
114 |
"gsm8k": {
|
115 |
"accuracy": 0.9469294920394238,
|
116 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T00-03-31-04-00_gsm8k_bKsUfCAfcmBCeryboNaLoX.eval"
|
117 |
},
|
118 |
"math": {
|
119 |
"accuracy": 0.6004,
|
120 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-34-50-04-00_math_2xiNcrGih26uzJdG4q88bM.eval"
|
121 |
},
|
122 |
"ifeval": {
|
123 |
"final_acc": 0.8604907201780166,
|
124 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-29-32-04-00_ifeval_Dwh3CF2ZYFrvw7UcTwrsvK.eval"
|
125 |
},
|
126 |
"arc_challenge": {
|
127 |
"accuracy": 0.9445392491467577,
|
128 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-58-56-04-00_arc-challenge_oFL5wFjT7KwNFhMFfe72JN.eval"
|
129 |
},
|
130 |
"arc_easy": {
|
131 |
"accuracy": 0.9823232323232324,
|
132 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-53-12-04-00_arc-easy_UXzR7cDeNteP39NoXUYnhm.eval"
|
133 |
},
|
134 |
"mmlu_pro": {
|
135 |
"accuracy": 0.6688829787234043,
|
136 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T06-11-16-04-00_mmlu-pro_oQiEBJdeKtEEt4cm9KL7uy.eval"
|
137 |
},
|
138 |
"humaneval": {
|
139 |
"mean": 0.7865853658536586,
|
140 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-28-25-04-00_humaneval_KcJV2rHuHJ2JLxijihEkcW.eval"
|
141 |
},
|
142 |
"mmlu": {
|
143 |
"accuracy": 0.8033755875231449,
|
144 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T03-51-50-04-00_mmlu_6SNjs2QmPRvqGnvbnNtaqb.eval"
|
145 |
},
|
146 |
"mmmu_multiple_choice": {
|
147 |
"accuracy": null,
|
|
|
105 |
},
|
106 |
"gpqa_diamond": {
|
107 |
"accuracy": 0.4318181818181818,
|
108 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T23-41-39-04-00_gpqa-diamond_TdLdYmVM6GCVMAECcXkuhj.eval"
|
109 |
},
|
110 |
"winogrande": {
|
111 |
"accuracy": 0.8666140489344909,
|
112 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T09-20-56-04-00_winogrande_WnUgkSRhSMvh3zUjnuJWQZ.eval"
|
113 |
},
|
114 |
"gsm8k": {
|
115 |
"accuracy": 0.9469294920394238,
|
116 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T00-03-31-04-00_gsm8k_bKsUfCAfcmBCeryboNaLoX.eval"
|
117 |
},
|
118 |
"math": {
|
119 |
"accuracy": 0.6004,
|
120 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-34-50-04-00_math_2xiNcrGih26uzJdG4q88bM.eval"
|
121 |
},
|
122 |
"ifeval": {
|
123 |
"final_acc": 0.8604907201780166,
|
124 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-29-32-04-00_ifeval_Dwh3CF2ZYFrvw7UcTwrsvK.eval"
|
125 |
},
|
126 |
"arc_challenge": {
|
127 |
"accuracy": 0.9445392491467577,
|
128 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-58-56-04-00_arc-challenge_oFL5wFjT7KwNFhMFfe72JN.eval"
|
129 |
},
|
130 |
"arc_easy": {
|
131 |
"accuracy": 0.9823232323232324,
|
132 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-53-12-04-00_arc-easy_UXzR7cDeNteP39NoXUYnhm.eval"
|
133 |
},
|
134 |
"mmlu_pro": {
|
135 |
"accuracy": 0.6688829787234043,
|
136 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T06-11-16-04-00_mmlu-pro_oQiEBJdeKtEEt4cm9KL7uy.eval"
|
137 |
},
|
138 |
"humaneval": {
|
139 |
"mean": 0.7865853658536586,
|
140 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-28-25-04-00_humaneval_KcJV2rHuHJ2JLxijihEkcW.eval"
|
141 |
},
|
142 |
"mmlu": {
|
143 |
"accuracy": 0.8033755875231449,
|
144 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T03-51-50-04-00_mmlu_6SNjs2QmPRvqGnvbnNtaqb.eval"
|
145 |
},
|
146 |
"mmmu_multiple_choice": {
|
147 |
"accuracy": null,
|
data/tasks.json
CHANGED
@@ -2,14 +2,14 @@
|
|
2 |
"arc_easy": {
|
3 |
"benchmark": "arc_easy",
|
4 |
"metric": "accuracy",
|
5 |
-
"display_name": "ARC-
|
6 |
"type": "base",
|
7 |
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
|
8 |
},
|
9 |
"arc_challenge": {
|
10 |
"benchmark": "arc_challenge",
|
11 |
"metric": "accuracy",
|
12 |
-
"display_name": "ARC-
|
13 |
"type": "base",
|
14 |
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
|
15 |
},
|
@@ -79,7 +79,7 @@
|
|
79 |
"gpqa_diamond": {
|
80 |
"benchmark": "gpqa_diamond",
|
81 |
"metric": "accuracy",
|
82 |
-
"display_name": "GPQA-
|
83 |
"type": "base",
|
84 |
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa"
|
85 |
},
|
|
|
2 |
"arc_easy": {
|
3 |
"benchmark": "arc_easy",
|
4 |
"metric": "accuracy",
|
5 |
+
"display_name": "ARC-E",
|
6 |
"type": "base",
|
7 |
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
|
8 |
},
|
9 |
"arc_challenge": {
|
10 |
"benchmark": "arc_challenge",
|
11 |
"metric": "accuracy",
|
12 |
+
"display_name": "ARC-C",
|
13 |
"type": "base",
|
14 |
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
|
15 |
},
|
|
|
79 |
"gpqa_diamond": {
|
80 |
"benchmark": "gpqa_diamond",
|
81 |
"metric": "accuracy",
|
82 |
+
"display_name": "GPQA-D",
|
83 |
"type": "base",
|
84 |
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa"
|
85 |
},
|