jwilles committed on
Commit
2b55816
·
1 Parent(s): ec5a2f9

Update tasks

Browse files
Files changed (2) hide show
  1. data/results.json +10 -10
  2. data/tasks.json +3 -3
data/results.json CHANGED
@@ -105,43 +105,43 @@
105
  },
106
  "gpqa_diamond": {
107
  "accuracy": 0.4318181818181818,
108
- "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T23-41-39-04-00_gpqa-diamond_TdLdYmVM6GCVMAECcXkuhj.eval"
109
  },
110
  "winogrande": {
111
  "accuracy": 0.8666140489344909,
112
- "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T09-20-56-04-00_winogrande_WnUgkSRhSMvh3zUjnuJWQZ.eval"
113
  },
114
  "gsm8k": {
115
  "accuracy": 0.9469294920394238,
116
- "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T00-03-31-04-00_gsm8k_bKsUfCAfcmBCeryboNaLoX.eval"
117
  },
118
  "math": {
119
  "accuracy": 0.6004,
120
- "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-34-50-04-00_math_2xiNcrGih26uzJdG4q88bM.eval"
121
  },
122
  "ifeval": {
123
  "final_acc": 0.8604907201780166,
124
- "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-29-32-04-00_ifeval_Dwh3CF2ZYFrvw7UcTwrsvK.eval"
125
  },
126
  "arc_challenge": {
127
  "accuracy": 0.9445392491467577,
128
- "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-58-56-04-00_arc-challenge_oFL5wFjT7KwNFhMFfe72JN.eval"
129
  },
130
  "arc_easy": {
131
  "accuracy": 0.9823232323232324,
132
- "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-53-12-04-00_arc-easy_UXzR7cDeNteP39NoXUYnhm.eval"
133
  },
134
  "mmlu_pro": {
135
  "accuracy": 0.6688829787234043,
136
- "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T06-11-16-04-00_mmlu-pro_oQiEBJdeKtEEt4cm9KL7uy.eval"
137
  },
138
  "humaneval": {
139
  "mean": 0.7865853658536586,
140
- "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-28-25-04-00_humaneval_KcJV2rHuHJ2JLxijihEkcW.eval"
141
  },
142
  "mmlu": {
143
  "accuracy": 0.8033755875231449,
144
- "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T03-51-50-04-00_mmlu_6SNjs2QmPRvqGnvbnNtaqb.eval"
145
  },
146
  "mmmu_multiple_choice": {
147
  "accuracy": null,
 
105
  },
106
  "gpqa_diamond": {
107
  "accuracy": 0.4318181818181818,
108
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T23-41-39-04-00_gpqa-diamond_TdLdYmVM6GCVMAECcXkuhj.eval"
109
  },
110
  "winogrande": {
111
  "accuracy": 0.8666140489344909,
112
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T09-20-56-04-00_winogrande_WnUgkSRhSMvh3zUjnuJWQZ.eval"
113
  },
114
  "gsm8k": {
115
  "accuracy": 0.9469294920394238,
116
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T00-03-31-04-00_gsm8k_bKsUfCAfcmBCeryboNaLoX.eval"
117
  },
118
  "math": {
119
  "accuracy": 0.6004,
120
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-34-50-04-00_math_2xiNcrGih26uzJdG4q88bM.eval"
121
  },
122
  "ifeval": {
123
  "final_acc": 0.8604907201780166,
124
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-29-32-04-00_ifeval_Dwh3CF2ZYFrvw7UcTwrsvK.eval"
125
  },
126
  "arc_challenge": {
127
  "accuracy": 0.9445392491467577,
128
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-58-56-04-00_arc-challenge_oFL5wFjT7KwNFhMFfe72JN.eval"
129
  },
130
  "arc_easy": {
131
  "accuracy": 0.9823232323232324,
132
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-53-12-04-00_arc-easy_UXzR7cDeNteP39NoXUYnhm.eval"
133
  },
134
  "mmlu_pro": {
135
  "accuracy": 0.6688829787234043,
136
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T06-11-16-04-00_mmlu-pro_oQiEBJdeKtEEt4cm9KL7uy.eval"
137
  },
138
  "humaneval": {
139
  "mean": 0.7865853658536586,
140
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-28-25-04-00_humaneval_KcJV2rHuHJ2JLxijihEkcW.eval"
141
  },
142
  "mmlu": {
143
  "accuracy": 0.8033755875231449,
144
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T03-51-50-04-00_mmlu_6SNjs2QmPRvqGnvbnNtaqb.eval"
145
  },
146
  "mmmu_multiple_choice": {
147
  "accuracy": null,
data/tasks.json CHANGED
@@ -2,14 +2,14 @@
2
  "arc_easy": {
3
  "benchmark": "arc_easy",
4
  "metric": "accuracy",
5
- "display_name": "ARC-Easy",
6
  "type": "base",
7
  "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
8
  },
9
  "arc_challenge": {
10
  "benchmark": "arc_challenge",
11
  "metric": "accuracy",
12
- "display_name": "ARC-Challenge",
13
  "type": "base",
14
  "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
15
  },
@@ -79,7 +79,7 @@
79
  "gpqa_diamond": {
80
  "benchmark": "gpqa_diamond",
81
  "metric": "accuracy",
82
- "display_name": "GPQA-Diamond",
83
  "type": "base",
84
  "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa"
85
  },
 
2
  "arc_easy": {
3
  "benchmark": "arc_easy",
4
  "metric": "accuracy",
5
+ "display_name": "ARC-E",
6
  "type": "base",
7
  "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
8
  },
9
  "arc_challenge": {
10
  "benchmark": "arc_challenge",
11
  "metric": "accuracy",
12
+ "display_name": "ARC-C",
13
  "type": "base",
14
  "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
15
  },
 
79
  "gpqa_diamond": {
80
  "benchmark": "gpqa_diamond",
81
  "metric": "accuracy",
82
+ "display_name": "GPQA-D",
83
  "type": "base",
84
  "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa"
85
  },