amezasor committed on
Commit
02e3249
1 Parent(s): be4e652

update: eval results

Browse files
Files changed (1) hide show
  1. README.md +69 -29
README.md CHANGED
@@ -12,18 +12,18 @@ model-index:
12
  - task:
13
  type: text-generation
14
  dataset:
15
- type: human-exams
16
- name: MMLU
17
  metrics:
18
  - name: pass@1
19
  type: pass@1
20
- value:
21
  verified: false
22
  - task:
23
  type: text-generation
24
  dataset:
25
- type: human-exams
26
- name: MMLU-Pro
27
  metrics:
28
  - name: pass@1
29
  type: pass@1
@@ -37,17 +37,27 @@ model-index:
37
  metrics:
38
  - name: pass@1
39
  type: pass@1
40
- value:
41
  verified: false
42
  - task:
43
  type: text-generation
44
  dataset:
45
- type: commonsense
46
- name: WinoGrande
47
  metrics:
48
  - name: pass@1
49
  type: pass@1
50
- value:
 
 
 
 
 
 
 
 
 
 
51
  verified: false
52
  - task:
53
  type: text-generation
@@ -57,7 +67,7 @@ model-index:
57
  metrics:
58
  - name: pass@1
59
  type: pass@1
60
- value:
61
  verified: false
62
  - task:
63
  type: text-generation
@@ -67,27 +77,27 @@ model-index:
67
  metrics:
68
  - name: pass@1
69
  type: pass@1
70
- value:
71
  verified: false
72
  - task:
73
  type: text-generation
74
  dataset:
75
  type: commonsense
76
- name: PIQA
77
  metrics:
78
  - name: pass@1
79
  type: pass@1
80
- value:
81
  verified: false
82
  - task:
83
  type: text-generation
84
  dataset:
85
  type: commonsense
86
- name: Hellaswag
87
  metrics:
88
  - name: pass@1
89
  type: pass@1
90
- value:
91
  verified: false
92
  - task:
93
  type: text-generation
@@ -97,7 +107,7 @@ model-index:
97
  metrics:
98
  - name: pass@1
99
  type: pass@1
100
- value:
101
  verified: false
102
  - task:
103
  type: text-generation
@@ -107,17 +117,17 @@ model-index:
107
  metrics:
108
  - name: pass@1
109
  type: pass@1
110
- value:
111
  verified: false
112
  - task:
113
  type: text-generation
114
  dataset:
115
  type: reading-comprehension
116
- name: SQuAD v2
117
  metrics:
118
  - name: pass@1
119
  type: pass@1
120
- value:
121
  verified: false
122
  - task:
123
  type: text-generation
@@ -127,7 +137,7 @@ model-index:
127
  metrics:
128
  - name: pass@1
129
  type: pass@1
130
- value:
131
  verified: false
132
  - task:
133
  type: text-generation
@@ -137,7 +147,7 @@ model-index:
137
  metrics:
138
  - name: pass@1
139
  type: pass@1
140
- value:
141
  verified: false
142
  - task:
143
  type: text-generation
@@ -147,17 +157,37 @@ model-index:
147
  metrics:
148
  - name: pass@1
149
  type: pass@1
150
- value:
151
  verified: false
152
  - task:
153
  type: text-generation
154
  dataset:
155
  type: code
156
- name: HumanEval
157
  metrics:
158
  - name: pass@1
159
  type: pass@1
160
- value:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  verified: false
162
  - task:
163
  type: text-generation
@@ -167,7 +197,7 @@ model-index:
167
  metrics:
168
  - name: pass@1
169
  type: pass@1
170
- value:
171
  verified: false
172
  - task:
173
  type: text-generation
@@ -177,7 +207,7 @@ model-index:
177
  metrics:
178
  - name: pass@1
179
  type: pass@1
180
- value:
181
  verified: false
182
  - task:
183
  type: text-generation
@@ -187,17 +217,27 @@ model-index:
187
  metrics:
188
  - name: pass@1
189
  type: pass@1
190
- value:
191
  verified: false
192
  - task:
193
  type: text-generation
194
  dataset:
195
  type: multilingual
196
- name: MGSM
197
  metrics:
198
  - name: pass@1
199
  type: pass@1
200
- value:
 
 
 
 
 
 
 
 
 
 
201
  verified: false
202
  ---
203
 
 
12
  - task:
13
  type: text-generation
14
  dataset:
15
+ type: instruction-following
16
+ name: IFEval
17
  metrics:
18
  - name: pass@1
19
  type: pass@1
20
+ value: 46.07
21
  verified: false
22
  - task:
23
  type: text-generation
24
  dataset:
25
+ type: instruction-following
26
+ name: MT-Bench
27
  metrics:
28
  - name: pass@1
29
  type: pass@1
 
37
  metrics:
38
  - name: pass@1
39
  type: pass@1
40
+ value: 29.75
41
  verified: false
42
  - task:
43
  type: text-generation
44
  dataset:
45
+ type: human-exams
46
+ name: MMLU
47
  metrics:
48
  - name: pass@1
49
  type: pass@1
50
+ value: 56.03
51
+ verified: false
52
+ - task:
53
+ type: text-generation
54
+ dataset:
55
+ type: human-exams
56
+ name: MMLU-Pro
57
+ metrics:
58
+ - name: pass@1
59
+ type: pass@1
60
+ value: 27.92
61
  verified: false
62
  - task:
63
  type: text-generation
 
67
  metrics:
68
  - name: pass@1
69
  type: pass@1
70
+ value: 43.20
71
  verified: false
72
  - task:
73
  type: text-generation
 
77
  metrics:
78
  - name: pass@1
79
  type: pass@1
80
+ value: 66.36
81
  verified: false
82
  - task:
83
  type: text-generation
84
  dataset:
85
  type: commonsense
86
+ name: Hellaswag
87
  metrics:
88
  - name: pass@1
89
  type: pass@1
90
+ value: 76.79
91
  verified: false
92
  - task:
93
  type: text-generation
94
  dataset:
95
  type: commonsense
96
+ name: WinoGrande
97
  metrics:
98
  - name: pass@1
99
  type: pass@1
100
+ value: 71.90
101
  verified: false
102
  - task:
103
  type: text-generation
 
107
  metrics:
108
  - name: pass@1
109
  type: pass@1
110
+ value: 53.37
111
  verified: false
112
  - task:
113
  type: text-generation
 
117
  metrics:
118
  - name: pass@1
119
  type: pass@1
120
+ value: 84.89
121
  verified: false
122
  - task:
123
  type: text-generation
124
  dataset:
125
  type: reading-comprehension
126
+ name: SQuAD 2.0
127
  metrics:
128
  - name: pass@1
129
  type: pass@1
130
+ value: 19.73
131
  verified: false
132
  - task:
133
  type: text-generation
 
137
  metrics:
138
  - name: pass@1
139
  type: pass@1
140
+ value: 54.35
141
  verified: false
142
  - task:
143
  type: text-generation
 
147
  metrics:
148
  - name: pass@1
149
  type: pass@1
150
+ value: 28.61
151
  verified: false
152
  - task:
153
  type: text-generation
 
157
  metrics:
158
  - name: pass@1
159
  type: pass@1
160
+ value: 43.74
161
  verified: false
162
  - task:
163
  type: text-generation
164
  dataset:
165
  type: code
166
+ name: HumanEvalSynthesis
167
  metrics:
168
  - name: pass@1
169
  type: pass@1
170
+ value: 50.61
171
+ verified: false
172
+ - task:
173
+ type: text-generation
174
+ dataset:
175
+ type: code
176
+ name: HumanEvalExplain
177
+ metrics:
178
+ - name: pass@1
179
+ type: pass@1
180
+ value: 45.58
181
+ verified: false
182
+ - task:
183
+ type: text-generation
184
+ dataset:
185
+ type: code
186
+ name: HumanEvalFix
187
+ metrics:
188
+ - name: pass@1
189
+ type: pass@1
190
+ value: 51.83
191
  verified: false
192
  - task:
193
  type: text-generation
 
197
  metrics:
198
  - name: pass@1
199
  type: pass@1
200
+ value: 41.00
201
  verified: false
202
  - task:
203
  type: text-generation
 
207
  metrics:
208
  - name: pass@1
209
  type: pass@1
210
+ value: 59.66
211
  verified: false
212
  - task:
213
  type: text-generation
 
217
  metrics:
218
  - name: pass@1
219
  type: pass@1
220
+ value: 23.66
221
  verified: false
222
  - task:
223
  type: text-generation
224
  dataset:
225
  type: multilingual
226
+ name: PAWS-X (7 langs)
227
  metrics:
228
  - name: pass@1
229
  type: pass@1
230
+ value: 61.42
231
+ verified: false
232
+ - task:
233
+ type: text-generation
234
+ dataset:
235
+ type: multilingual
236
+ name: MGSM (6 langs)
237
+ metrics:
238
+ - name: pass@1
239
+ type: pass@1
240
+ value: 37.13
241
  verified: false
242
  ---
243