alexmarques committed on
Commit
cc881d0
1 Parent(s): 816f7dd

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +16 -16
README.md CHANGED
@@ -179,9 +179,9 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
179
  <tr>
180
  <td>MMLU (5-shot)
181
  </td>
182
- <td>83.94
183
  </td>
184
- <td>83.71
185
  </td>
186
  <td>99.7%
187
  </td>
@@ -189,9 +189,9 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
189
  <tr>
190
  <td>MMLU (CoT, 0-shot)
191
  </td>
192
- <td>86.23
193
  </td>
194
- <td>85.81
195
  </td>
196
  <td>99.5%
197
  </td>
@@ -199,9 +199,9 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
199
  <tr>
200
  <td>ARC Challenge (0-shot)
201
  </td>
202
- <td>93.34
203
  </td>
204
- <td>93.09
205
  </td>
206
  <td>99.7%
207
  </td>
@@ -209,9 +209,9 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
209
  <tr>
210
  <td>GSM-8K (CoT, 8-shot, strict-match)
211
  </td>
212
- <td>95.38
213
  </td>
214
- <td>94.24
215
  </td>
216
  <td>98.8%
217
  </td>
@@ -219,9 +219,9 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
219
  <tr>
220
  <td>Hellaswag (10-shot)
221
  </td>
222
- <td>86.66
223
  </td>
224
- <td>86.65
225
  </td>
226
  <td>100.0%
227
  </td>
@@ -229,9 +229,9 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
229
  <tr>
230
  <td>Winogrande (5-shot)
231
  </td>
232
- <td>85.32
233
  </td>
234
- <td>85.10
235
  </td>
236
  <td>100.1%
237
  </td>
@@ -239,9 +239,9 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
239
  <tr>
240
  <td>TruthfulQA (0-shot, mc2)
241
  </td>
242
- <td>60.65
243
  </td>
244
- <td>61.43
245
  </td>
246
  <td>101.3%
247
  </td>
@@ -249,9 +249,9 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
249
  <tr>
250
  <td><strong>Average</strong>
251
  </td>
252
- <td><strong>84.50</strong>
253
  </td>
254
- <td><strong>84.33</strong>
255
  </td>
256
  <td><strong>99.9%</strong>
257
  </td>
 
179
  <tr>
180
  <td>MMLU (5-shot)
181
  </td>
182
+ <td>83.9
183
  </td>
184
+ <td>83.7
185
  </td>
186
  <td>99.7%
187
  </td>
 
189
  <tr>
190
  <td>MMLU (CoT, 0-shot)
191
  </td>
192
+ <td>86.2
193
  </td>
194
+ <td>85.8
195
  </td>
196
  <td>99.5%
197
  </td>
 
199
  <tr>
200
  <td>ARC Challenge (0-shot)
201
  </td>
202
+ <td>93.3
203
  </td>
204
+ <td>93.1
205
  </td>
206
  <td>99.7%
207
  </td>
 
209
  <tr>
210
  <td>GSM-8K (CoT, 8-shot, strict-match)
211
  </td>
212
+ <td>95.4
213
  </td>
214
+ <td>94.2
215
  </td>
216
  <td>98.8%
217
  </td>
 
219
  <tr>
220
  <td>Hellaswag (10-shot)
221
  </td>
222
+ <td>86.7
223
  </td>
224
+ <td>86.7
225
  </td>
226
  <td>100.0%
227
  </td>
 
229
  <tr>
230
  <td>Winogrande (5-shot)
231
  </td>
232
+ <td>85.3
233
  </td>
234
+ <td>85.4
235
  </td>
236
  <td>100.1%
237
  </td>
 
239
  <tr>
240
  <td>TruthfulQA (0-shot, mc2)
241
  </td>
242
+ <td>60.7
243
  </td>
244
+ <td>61.4
245
  </td>
246
  <td>101.3%
247
  </td>
 
249
  <tr>
250
  <td><strong>Average</strong>
251
  </td>
252
+ <td><strong>84.5</strong>
253
  </td>
254
+ <td><strong>84.3</strong>
255
  </td>
256
  <td><strong>99.9%</strong>
257
  </td>