slimfrikha-tii committed
Commit 15015d2
1 Parent(s): e37f587

docs(readme): benchs

Files changed (1):
  1. README.md +74 -72

README.md CHANGED
@@ -23,6 +23,7 @@ Falcon3-7B-Instruct supports 4 languages (English, French, Spanish, Portuguese)
  - Grouped query attention (GQA) for faster inference: 12 query heads and 4 KV heads
  - Wider head dimension: 256
  - High RoPE value to support long context understanding: 1000042
+ - Uses SwiGLU and RMSNorm
  - 32k context length
  - 131k vocab size
  - Pretrained on 14 Teratokens of datasets comprising web, code, STEM, high-quality and multilingual data using 2048 H100 GPU chips
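The architecture bullets above can be sanity-checked against the published config. A minimal sketch, assuming the checkpoint exposes the usual Llama-style config fields (`num_attention_heads`, `num_key_value_heads`, `rope_theta`, and so on); this is illustrative, not a documented Falcon3 guarantee:

```python
from transformers import AutoConfig

# Illustrative check of the bullets above; attribute names assume the
# Llama-style config layout (an assumption, not documented behavior).
cfg = AutoConfig.from_pretrained("tiiuae/Falcon3-7B-Instruct")

# head_dim may not be a stored field; fall back to deriving it.
head_dim = getattr(cfg, "head_dim", None) or cfg.hidden_size // cfg.num_attention_heads

print(cfg.num_attention_heads)      # expect 12 (query heads)
print(cfg.num_key_value_heads)      # expect 4  (KV heads, GQA)
print(head_dim)                     # expect 256
print(cfg.rope_theta)               # expect 1000042
print(cfg.max_position_embeddings)  # expect 32k context window
print(cfg.vocab_size)               # expect ~131k
```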
@@ -49,7 +50,7 @@ model_name = "tiiuae/Falcon3-7B-Instruct"
  model = AutoModelForCausalLM.from_pretrained(
      model_name,
      torch_dtype="auto",
      device_map="auto"
  )
  tokenizer = AutoTokenizer.from_pretrained(model_name)
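As a usage note, a minimal generation sketch building on the quickstart snippet above; the chat-template call is standard `transformers` API, while the prompt and `max_new_tokens` value are illustrative assumptions:

```python
# Assumes `model` and `tokenizer` from the quickstart snippet above.
messages = [
    {"role": "user", "content": "Explain grouped query attention in one sentence."},
]

# Render the conversation with the model's chat template and tokenize it.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

output_ids = model.generate(input_ids, max_new_tokens=128)

# Strip the prompt tokens and decode only the model's reply.
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```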
@@ -90,8 +91,6 @@ We report in the following table our internal pipeline benchmarks:
  <col style="width: 10%;">
  <col style="width: 7%;">
  <col style="width: 7%;">
- <col style="width: 7%;">
- <col style="width: 7%;">
  <col style="background-color: rgba(80, 15, 213, 0.5); width: 7%;">
  </colgroup>
  <thead>
@@ -99,9 +98,7 @@ We report in the following table our internal pipeline benchmarks:
  <th>Category</th>
  <th>Benchmark</th>
  <th>Llama-3.1-8B-Instruct</th>
- <th>Qwen2-7B-Instruct</th>
  <th>Qwen2.5-7B-Instruct</th>
- <th>gemma-2-9b-it</th>
  <th>Falcon3-7B-Instruct</th>
  </tr>
  </thead>
@@ -109,110 +106,115 @@ We report in the following table our internal pipeline benchmarks:
  <tr>
  <td rowspan="3">General</td>
  <td>MMLU (5-shot)</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
+ <td>55.9</td>
+ <td><b>72.4</b></td>
+ <td>68</td>
  </tr>
  <tr>
  <td>MMLU-PRO (5-shot)</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
+ <td>21.8</td>
+ <td>35.8</td>
+ <td><b>40.7</b></td>
  </tr>
  <tr>
  <td>IFEval</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
+ <td><b>78.8</b></td>
+ <td>74.7</td>
+ <td>76.5</td>
  </tr>
  <tr>
- <td rowspan="2">Math</td>
+ <td rowspan="3">Math</td>
  <td>GSM8K (5-shot)</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
+ <td>19.2</td>
+ <td>33.7</td>
+ <td><b>78.8</b></td>
+ </tr>
+ <tr>
+ <td>GSM8K (8-shot, COT)</td>
+ <td>79.8</td>
+ <td>72.7</td>
+ <td><b>80.9</b></td>
  </tr>
  <tr>
  <td>MATH (4-shot)</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
+ <td>10.4</td>
+ <td>26</td>
+ <td><b>33.1</b></td>
  </tr>
  <tr>
- <td rowspan="4">Reasoning</td>
+ <td rowspan="6">Reasoning</td>
  <td>Arc Challenge (25-shot)</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
+ <td>46.6</td>
+ <td>55.7</td>
+ <td><b>65.9</b></td>
  </tr>
  <tr>
  <td>GPQA (0-shot)</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
+ <td><b>33.6</b></td>
+ <td>31.9</td>
+ <td>32</td>
+ </tr>
+ <tr>
+ <td>GPQA (0-shot, COT)</td>
+ <td>9.6</td>
+ <td>13.8</td>
+ <td><b>22.3</b></td>
  </tr>
  <tr>
  <td>MUSR (0-shot)</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
+ <td>38.6</td>
+ <td>40.7</td>
+ <td><b>46.4</b></td>
  </tr>
  <tr>
  <td>BBH (3-shot)</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
+ <td>43.7</td>
+ <td><b>53.9</b></td>
+ <td>52.4</td>
+ </tr>
+ <tr>
+ <td>BBH (3-shot, COT)</td>
+ <td>6.7</td>
+ <td>21.2</td>
+ <td><b>69.3</b></td>
  </tr>
  <tr>
  <td rowspan="4">CommonSense Understanding</td>
  <td>PIQA (0-shot)</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
+ <td><b>78.9</b></td>
+ <td>73.7</td>
+ <td>78.8</td>
  </tr>
  <tr>
  <td>SciQ (0-shot)</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
+ <td>80.2</td>
+ <td>50.9</td>
+ <td><b>94.7</b></td>
  </tr>
  <tr>
  <td>Winogrande (0-shot)</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
+ <td>TODO</td>
+ <td>TODO</td>
+ <td>70.4</td>
  </tr>
  <tr>
  <td>OpenbookQA (0-shot)</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
- <td>-</td>
+ <td><b>46.2</b></td>
+ <td>42.4</td>
+ <td>45.8</td>
+ </tr>
+ <tr>
+ <td rowspan="2">Instruction following</td>
+ <td>MT-Bench (avg)</td>
+ <td>7.86</td>
+ <td><b>8.54</b></td>
+ <td>8.36</td>
+ </tr>
+ <tr>
+ <td>Alpaca (WC)</td>
+ <td>26.57</td>
+ <td><b>31.5</b></td>
+ <td>26.13</td>
  </tr>
  </tbody>
  </table>
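The scores above come from TII's internal pipeline, which is not public. For a rough open point of comparison, a hedged sketch using EleutherAI's lm-evaluation-harness (an assumption — the card does not say this is the tool used, and scores will not match the table exactly):

```python
# Hedged sketch: approximate reproduction of one table row (MMLU, 5-shot)
# with lm-evaluation-harness, assuming its v0.4-style Python API.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=tiiuae/Falcon3-7B-Instruct,dtype=auto",
    tasks=["mmlu"],   # 5-shot MMLU, as in the first table row
    num_fewshot=5,
    batch_size=8,
)
print(results["results"]["mmlu"])
```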
 