Adding Evaluation Results

#66
Files changed (1) hide show
  1. README.md +165 -78
README.md CHANGED
@@ -1,34 +1,32 @@
1
  ---
 
 
 
2
  tags:
3
  - generated_from_trainer
4
- license: mit
5
  datasets:
6
  - HuggingFaceH4/ultrachat_200k
7
  - HuggingFaceH4/ultrafeedback_binarized
8
- language:
9
- - en
10
- base_model: mistralai/Mistral-7B-v0.1
11
  widget:
12
- - example_title: Pirate!
13
- messages:
14
- - role: system
15
- content: You are a pirate chatbot who always responds with Arr!
16
- - role: user
17
- content: "There's a llama on my lawn, how can I get rid of him?"
18
- output:
19
- text: >-
20
- Arr! 'Tis a puzzlin' matter, me hearty! A llama on yer lawn be a rare
21
- sight, but I've got a plan that might help ye get rid of 'im. Ye'll need
22
- to gather some carrots and hay, and then lure the llama away with the
23
- promise of a tasty treat. Once he's gone, ye can clean up yer lawn and
24
- enjoy the peace and quiet once again. But beware, me hearty, for there
25
- may be more llamas where that one came from! Arr!
26
  pipeline_tag: text-generation
27
  model-index:
28
  - name: zephyr-7b-beta
29
  results:
30
- # AI2 Reasoning Challenge (25-Shot)
31
- - task:
32
  type: text-generation
33
  name: Text Generation
34
  dataset:
@@ -39,15 +37,13 @@ model-index:
39
  args:
40
  num_few_shot: 25
41
  metrics:
42
- - type: acc_norm
43
- name: normalized accuracy
44
- value: 62.03071672354948
45
  source:
46
- name: Open LLM Leaderboard
47
  url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
48
-
49
- # HellaSwag (10-shot)
50
- - task:
51
  type: text-generation
52
  name: Text Generation
53
  dataset:
@@ -57,15 +53,13 @@ model-index:
57
  args:
58
  num_few_shot: 10
59
  metrics:
60
- - type: acc_norm
61
- name: normalized accuracy
62
- value: 84.35570603465445
63
  source:
64
- name: Open LLM Leaderboard
65
  url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
66
-
67
- # DROP (3-shot)
68
- - task:
69
  type: text-generation
70
  name: Text Generation
71
  dataset:
@@ -75,15 +69,13 @@ model-index:
75
  args:
76
  num_few_shot: 3
77
  metrics:
78
- - type: f1
79
- name: f1 score
80
- value: 9.662437080536909
81
  source:
82
- name: Open LLM Leaderboard
83
  url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
84
-
85
- # TruthfulQA (0-shot)
86
- - task:
87
  type: text-generation
88
  name: Text Generation
89
  dataset:
@@ -94,14 +86,12 @@ model-index:
94
  args:
95
  num_few_shot: 0
96
  metrics:
97
- - type: mc2
98
- value: 57.44916942762855
99
  source:
100
- name: Open LLM Leaderboard
101
  url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
102
-
103
- # GSM8k (5-shot)
104
- - task:
105
  type: text-generation
106
  name: Text Generation
107
  dataset:
@@ -112,15 +102,13 @@ model-index:
112
  args:
113
  num_few_shot: 5
114
  metrics:
115
- - type: acc
116
- name: accuracy
117
- value: 12.736921910538287
118
  source:
119
- name: Open LLM Leaderboard
120
  url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
121
-
122
- # MMLU (5-Shot)
123
- - task:
124
  type: text-generation
125
  name: Text Generation
126
  dataset:
@@ -131,15 +119,13 @@ model-index:
131
  args:
132
  num_few_shot: 5
133
  metrics:
134
- - type: acc
135
- name: accuracy
136
- value: 61.07
137
  source:
138
- name: Open LLM Leaderboard
139
  url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
140
-
141
- # Winogrande (5-shot)
142
- - task:
143
  type: text-generation
144
  name: Text Generation
145
  dataset:
@@ -150,40 +136,128 @@ model-index:
150
  args:
151
  num_few_shot: 5
152
  metrics:
153
- - type: acc
154
- name: accuracy
155
- value: 77.74269928966061
156
  source:
157
- name: Open LLM Leaderboard
158
  url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
159
-
160
- # AlpacaEval (taken from model card)
161
- - task:
162
  type: text-generation
163
  name: Text Generation
164
  dataset:
165
  name: AlpacaEval
166
  type: tatsu-lab/alpaca_eval
167
  metrics:
168
- - type: unknown
169
- name: win rate
170
- value: 0.9060
171
  source:
172
  url: https://tatsu-lab.github.io/alpaca_eval/
173
-
174
- # MT-Bench (taken from model card)
175
- - task:
176
  type: text-generation
177
  name: Text Generation
178
  dataset:
179
  name: MT-Bench
180
  type: unknown
181
  metrics:
182
- - type: unknown
183
- name: score
184
- value: 7.34
185
  source:
186
  url: https://huggingface.co/spaces/lmsys/mt-bench
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  ---
188
 
189
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -441,4 +515,17 @@ Detailed results can be found [here](https://huggingface.co/datasets/open-llm-le
441
  | TruthfulQA (0-shot) | 57.45 |
442
  | Winogrande (5-shot) | 77.74 |
443
  | GSM8K (5-shot) | 12.74 |
444
- | DROP (3-shot) | 9.66 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language:
3
+ - en
4
+ license: mit
5
  tags:
6
  - generated_from_trainer
7
+ base_model: mistralai/Mistral-7B-v0.1
8
  datasets:
9
  - HuggingFaceH4/ultrachat_200k
10
  - HuggingFaceH4/ultrafeedback_binarized
 
 
 
11
  widget:
12
+ - example_title: Pirate!
13
+ messages:
14
+ - role: system
15
+ content: You are a pirate chatbot who always responds with Arr!
16
+ - role: user
17
+ content: There's a llama on my lawn, how can I get rid of him?
18
+ output:
19
+ text: Arr! 'Tis a puzzlin' matter, me hearty! A llama on yer lawn be a rare sight,
20
+ but I've got a plan that might help ye get rid of 'im. Ye'll need to gather
21
+ some carrots and hay, and then lure the llama away with the promise of a tasty
22
+ treat. Once he's gone, ye can clean up yer lawn and enjoy the peace and quiet
23
+ once again. But beware, me hearty, for there may be more llamas where that one
24
+ came from! Arr!
 
25
  pipeline_tag: text-generation
26
  model-index:
27
  - name: zephyr-7b-beta
28
  results:
29
+ - task:
 
30
  type: text-generation
31
  name: Text Generation
32
  dataset:
 
37
  args:
38
  num_few_shot: 25
39
  metrics:
40
+ - type: acc_norm
41
+ value: 62.03071672354948
42
+ name: normalized accuracy
43
  source:
 
44
  url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
45
+ name: Open LLM Leaderboard
46
+ - task:
 
47
  type: text-generation
48
  name: Text Generation
49
  dataset:
 
53
  args:
54
  num_few_shot: 10
55
  metrics:
56
+ - type: acc_norm
57
+ value: 84.35570603465445
58
+ name: normalized accuracy
59
  source:
 
60
  url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
61
+ name: Open LLM Leaderboard
62
+ - task:
 
63
  type: text-generation
64
  name: Text Generation
65
  dataset:
 
69
  args:
70
  num_few_shot: 3
71
  metrics:
72
+ - type: f1
73
+ value: 9.66243708053691
74
+ name: f1 score
75
  source:
 
76
  url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
77
+ name: Open LLM Leaderboard
78
+ - task:
 
79
  type: text-generation
80
  name: Text Generation
81
  dataset:
 
86
  args:
87
  num_few_shot: 0
88
  metrics:
89
+ - type: mc2
90
+ value: 57.44916942762855
91
  source:
 
92
  url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
93
+ name: Open LLM Leaderboard
94
+ - task:
 
95
  type: text-generation
96
  name: Text Generation
97
  dataset:
 
102
  args:
103
  num_few_shot: 5
104
  metrics:
105
+ - type: acc
106
+ value: 12.736921910538287
107
+ name: accuracy
108
  source:
 
109
  url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
110
+ name: Open LLM Leaderboard
111
+ - task:
 
112
  type: text-generation
113
  name: Text Generation
114
  dataset:
 
119
  args:
120
  num_few_shot: 5
121
  metrics:
122
+ - type: acc
123
+ value: 61.07
124
+ name: accuracy
125
  source:
 
126
  url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
127
+ name: Open LLM Leaderboard
128
+ - task:
 
129
  type: text-generation
130
  name: Text Generation
131
  dataset:
 
136
  args:
137
  num_few_shot: 5
138
  metrics:
139
+ - type: acc
140
+ value: 77.7426992896606
141
+ name: accuracy
142
  source:
 
143
  url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
144
+ name: Open LLM Leaderboard
145
+ - task:
 
146
  type: text-generation
147
  name: Text Generation
148
  dataset:
149
  name: AlpacaEval
150
  type: tatsu-lab/alpaca_eval
151
  metrics:
152
+ - type: unknown
153
+ value: 0.906
154
+ name: win rate
155
  source:
156
  url: https://tatsu-lab.github.io/alpaca_eval/
157
+ - task:
 
 
158
  type: text-generation
159
  name: Text Generation
160
  dataset:
161
  name: MT-Bench
162
  type: unknown
163
  metrics:
164
+ - type: unknown
165
+ value: 7.34
166
+ name: score
167
  source:
168
  url: https://huggingface.co/spaces/lmsys/mt-bench
169
+ - task:
170
+ type: text-generation
171
+ name: Text Generation
172
+ dataset:
173
+ name: IFEval (0-Shot)
174
+ type: HuggingFaceH4/ifeval
175
+ args:
176
+ num_few_shot: 0
177
+ metrics:
178
+ - type: inst_level_strict_acc and prompt_level_strict_acc
179
+ value: 49.5
180
+ name: strict accuracy
181
+ source:
182
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
183
+ name: Open LLM Leaderboard
184
+ - task:
185
+ type: text-generation
186
+ name: Text Generation
187
+ dataset:
188
+ name: BBH (3-Shot)
189
+ type: BBH
190
+ args:
191
+ num_few_shot: 3
192
+ metrics:
193
+ - type: acc_norm
194
+ value: 21.49
195
+ name: normalized accuracy
196
+ source:
197
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
198
+ name: Open LLM Leaderboard
199
+ - task:
200
+ type: text-generation
201
+ name: Text Generation
202
+ dataset:
203
+ name: MATH Lvl 5 (4-Shot)
204
+ type: hendrycks/competition_math
205
+ args:
206
+ num_few_shot: 4
207
+ metrics:
208
+ - type: exact_match
209
+ value: 2.72
210
+ name: exact match
211
+ source:
212
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
213
+ name: Open LLM Leaderboard
214
+ - task:
215
+ type: text-generation
216
+ name: Text Generation
217
+ dataset:
218
+ name: GPQA (0-shot)
219
+ type: Idavidrein/gpqa
220
+ args:
221
+ num_few_shot: 0
222
+ metrics:
223
+ - type: acc_norm
224
+ value: 5.37
225
+ name: normalized accuracy
226
+ source:
227
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
228
+ name: Open LLM Leaderboard
229
+ - task:
230
+ type: text-generation
231
+ name: Text Generation
232
+ dataset:
233
+ name: MuSR (0-shot)
234
+ type: TAUR-Lab/MuSR
235
+ args:
236
+ num_few_shot: 0
237
+ metrics:
238
+ - type: acc_norm
239
+ value: 7.73
240
+ name: normalized accuracy
241
+ source:
242
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
243
+ name: Open LLM Leaderboard
244
+ - task:
245
+ type: text-generation
246
+ name: Text Generation
247
+ dataset:
248
+ name: MMLU-PRO (5-shot)
249
+ type: TIGER-Lab/MMLU-Pro
250
+ config: main
251
+ split: test
252
+ args:
253
+ num_few_shot: 5
254
+ metrics:
255
+ - type: acc
256
+ value: 19.79
257
+ name: accuracy
258
+ source:
259
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
260
+ name: Open LLM Leaderboard
261
  ---
262
 
263
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
515
  | TruthfulQA (0-shot) | 57.45 |
516
  | Winogrande (5-shot) | 77.74 |
517
  | GSM8K (5-shot) | 12.74 |
518
+ | DROP (3-shot) | 9.66 |
519
+ # [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
520
+ Detailed results can be found in the [zephyr-7b-beta details dataset](https://huggingface.co/datasets/open-llm-leaderboard/HuggingFaceH4__zephyr-7b-beta-details)
521
+
522
+ | Metric |Value|
523
+ |-------------------|----:|
524
+ |Avg. |17.77|
525
+ |IFEval (0-Shot) |49.50|
526
+ |BBH (3-Shot) |21.49|
527
+ |MATH Lvl 5 (4-Shot)| 2.72|
528
+ |GPQA (0-shot) | 5.37|
529
+ |MuSR (0-shot) | 7.73|
530
+ |MMLU-PRO (5-shot) |19.79|
531
+