agentlans committed on
Commit
e8b975c
·
verified ·
1 Parent(s): 6a3f3f2

Upload 13 files

Browse files
README.md CHANGED
@@ -20,22 +20,22 @@ model-index:
20
  type: Zyphra/Zyda-2
21
  metrics:
22
  - type: accuracy
23
- value: 0.5387
24
  name: Accuracy
25
- base_model: agentlans/deberta-finewebedu
26
  ---
27
 
28
- # DeBERTa-v3-xsmall-zyda-2
29
 
30
  ## Model Description
31
 
32
- This model is a fine-tuned version of [agentlans/deberta-finewebedu](https://huggingface.co/agentlans/deberta-finewebedu) on a subset of the [Zyphra/Zyda-2](https://huggingface.co/datasets/Zyphra/Zyda-2) dataset. It was trained using the Masked Language Modeling (MLM) objective to enhance its understanding of the English language.
33
 
34
  ## Performance
35
 
36
  The model achieves the following results on the evaluation set:
37
- - Loss: 2.9234
38
- - Accuracy: 0.5387
39
 
40
  ## Intended Uses & Limitations
41
 
@@ -50,7 +50,7 @@ This model is designed to be used and finetuned for the following tasks:
50
 
51
  ## Training Data
52
 
53
- The model was trained on the first 100 000 rows of the [Zyphra/Zyda-2](https://huggingface.co/datasets/Zyphra/Zyda-2) dataset.
54
  5% of that data was used for validation.
55
 
56
  ## Training Procedure
@@ -66,12 +66,12 @@ The following hyperparameters were used during training:
66
  - Learning rate scheduler: Linear
67
  - Number of epochs: 1.0
68
 
69
- ### Framework Versions
70
 
71
- - Transformers: 4.44.2
72
- - PyTorch: 2.5.1+cu124
73
  - Datasets: 3.1.0
74
- - Tokenizers: 0.19.1
75
 
76
  ## Usage Examples
77
 
@@ -110,4 +110,4 @@ As this model is trained on a subset of the Zyda-2 dataset, it may inherit biase
110
 
111
  ## Additional Information
112
 
113
- For more details about the base model, please refer to [agentlans/deberta-finewebedu](https://huggingface.co/agentlans/deberta-finewebedu).
 
20
  type: Zyphra/Zyda-2
21
  metrics:
22
  - type: accuracy
23
+ value: 0.5607
24
  name: Accuracy
25
+ base_model: microsoft/deberta-v3-xsmall
26
  ---
27
 
28
+ # DeBERTa-v3-xsmall-Zyda-2
29
 
30
  ## Model Description
31
 
32
+ This model is a fine-tuned version of [microsoft/deberta-v3-xsmall](https://huggingface.co/microsoft/deberta-v3-xsmall) on a subset of the [Zyphra/Zyda-2](https://huggingface.co/datasets/Zyphra/Zyda-2) dataset. It was trained using the Masked Language Modeling (MLM) objective to enhance its understanding of the English language.
33
 
34
  ## Performance
35
 
36
  The model achieves the following results on the evaluation set:
37
+ - Loss: 2.6347
38
+ - Accuracy: 0.5607
39
 
40
  ## Intended Uses & Limitations
41
 
 
50
 
51
  ## Training Data
52
 
53
+ The model was trained on the first 300 000 rows of the [Zyphra/Zyda-2](https://huggingface.co/datasets/Zyphra/Zyda-2) dataset.
54
  5% of that data was used for validation.
55
 
56
  ## Training Procedure
 
66
  - Learning rate scheduler: Linear
67
  - Number of epochs: 1.0
68
 
69
+ ### Framework Versions
70
 
71
+ - Transformers: 4.46.3
72
+ - PyTorch: 2.5.1+cu124
73
  - Datasets: 3.1.0
74
+ - Tokenizers: 0.20.3
75
 
76
  ## Usage Examples
77
 
 
110
 
111
  ## Additional Information
112
 
113
+ For more details about the base model, please refer to [microsoft/deberta-v3-xsmall](https://huggingface.co/microsoft/deberta-v3-xsmall).
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_accuracy": 0.5387296045953106,
4
- "eval_loss": 2.923440933227539,
5
- "eval_runtime": 126.5222,
6
- "eval_samples": 11620,
7
- "eval_samples_per_second": 91.842,
8
- "eval_steps_per_second": 11.484,
9
- "perplexity": 18.60519668247528,
10
- "total_flos": 1.5038202327662592e+16,
11
- "train_loss": 3.3210895868885175,
12
- "train_runtime": 6630.0799,
13
- "train_samples": 226928,
14
- "train_samples_per_second": 34.227,
15
- "train_steps_per_second": 4.278
16
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_accuracy": 0.5606630977618748,
4
+ "eval_loss": 2.634730815887451,
5
+ "eval_runtime": 407.3409,
6
+ "eval_samples": 36612,
7
+ "eval_samples_per_second": 89.88,
8
+ "eval_steps_per_second": 11.236,
9
+ "perplexity": 13.939559650092963,
10
+ "total_flos": 4.634223291773338e+16,
11
+ "train_loss": 3.132497888326312,
12
+ "train_runtime": 20429.0413,
13
+ "train_samples": 699309,
14
+ "train_samples_per_second": 34.231,
15
+ "train_steps_per_second": 4.279
16
  }
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "agentlans/deberta-finewebedu",
3
  "architectures": [
4
  "DebertaV2ForMaskedLM"
5
  ],
@@ -29,7 +29,7 @@
29
  "relative_attention": true,
30
  "share_att_key": true,
31
  "torch_dtype": "float32",
32
- "transformers_version": "4.44.2",
33
  "type_vocab_size": 0,
34
  "vocab_size": 128100
35
  }
 
1
  {
2
+ "_name_or_path": "microsoft/deberta-v3-xsmall",
3
  "architectures": [
4
  "DebertaV2ForMaskedLM"
5
  ],
 
29
  "relative_attention": true,
30
  "share_att_key": true,
31
  "torch_dtype": "float32",
32
+ "transformers_version": "4.46.3",
33
  "type_vocab_size": 0,
34
  "vocab_size": 128100
35
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_accuracy": 0.5387296045953106,
4
- "eval_loss": 2.923440933227539,
5
- "eval_runtime": 126.5222,
6
- "eval_samples": 11620,
7
- "eval_samples_per_second": 91.842,
8
- "eval_steps_per_second": 11.484,
9
- "perplexity": 18.60519668247528
10
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_accuracy": 0.5606630977618748,
4
+ "eval_loss": 2.634730815887451,
5
+ "eval_runtime": 407.3409,
6
+ "eval_samples": 36612,
7
+ "eval_samples_per_second": 89.88,
8
+ "eval_steps_per_second": 11.236,
9
+ "perplexity": 13.939559650092963
10
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5947d8166d7e82611f205b72ba9585b7060868017f4255ccd5ad3405d5e7e9df
3
  size 283860016
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:656a30eef186b57c86bad3c788ba0177fbd70e219eb02d0459ef9eda09df43c2
3
  size 283860016
special_tokens_map.json CHANGED
@@ -1,46 +1,10 @@
1
  {
2
- "bos_token": {
3
- "content": "[CLS]",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "cls_token": {
10
- "content": "[CLS]",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "eos_token": {
17
- "content": "[SEP]",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
- "mask_token": {
24
- "content": "[MASK]",
25
- "lstrip": false,
26
- "normalized": false,
27
- "rstrip": false,
28
- "single_word": false
29
- },
30
- "pad_token": {
31
- "content": "[PAD]",
32
- "lstrip": false,
33
- "normalized": false,
34
- "rstrip": false,
35
- "single_word": false
36
- },
37
- "sep_token": {
38
- "content": "[SEP]",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false
43
- },
44
  "unk_token": {
45
  "content": "[UNK]",
46
  "lstrip": false,
 
1
  {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  "unk_token": {
9
  "content": "[UNK]",
10
  "lstrip": false,
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -42,21 +42,17 @@
42
  }
43
  },
44
  "bos_token": "[CLS]",
45
- "clean_up_tokenization_spaces": true,
46
  "cls_token": "[CLS]",
47
  "do_lower_case": false,
48
  "eos_token": "[SEP]",
49
  "mask_token": "[MASK]",
50
- "max_length": 1024,
51
  "model_max_length": 1000000000000000019884624838656,
52
  "pad_token": "[PAD]",
53
  "sep_token": "[SEP]",
54
  "sp_model_kwargs": {},
55
  "split_by_punct": false,
56
- "stride": 0,
57
  "tokenizer_class": "DebertaV2Tokenizer",
58
- "truncation_side": "right",
59
- "truncation_strategy": "longest_first",
60
  "unk_token": "[UNK]",
61
  "vocab_type": "spm"
62
  }
 
42
  }
43
  },
44
  "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": false,
46
  "cls_token": "[CLS]",
47
  "do_lower_case": false,
48
  "eos_token": "[SEP]",
49
  "mask_token": "[MASK]",
 
50
  "model_max_length": 1000000000000000019884624838656,
51
  "pad_token": "[PAD]",
52
  "sep_token": "[SEP]",
53
  "sp_model_kwargs": {},
54
  "split_by_punct": false,
 
55
  "tokenizer_class": "DebertaV2Tokenizer",
 
 
56
  "unk_token": "[UNK]",
57
  "vocab_type": "spm"
58
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "total_flos": 1.5038202327662592e+16,
4
- "train_loss": 3.3210895868885175,
5
- "train_runtime": 6630.0799,
6
- "train_samples": 226928,
7
- "train_samples_per_second": 34.227,
8
- "train_steps_per_second": 4.278
9
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "total_flos": 4.634223291773338e+16,
4
+ "train_loss": 3.132497888326312,
5
+ "train_runtime": 20429.0413,
6
+ "train_samples": 699309,
7
+ "train_samples_per_second": 34.231,
8
+ "train_steps_per_second": 4.279
9
  }
trainer_state.json CHANGED
@@ -3,415 +3,1241 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 28366,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.017626736233519003,
13
- "grad_norm": 5.204168796539307,
14
- "learning_rate": 4.9118663188324054e-05,
15
- "loss": 3.9535,
16
  "step": 500
17
  },
18
  {
19
- "epoch": 0.035253472467038006,
20
- "grad_norm": 5.162827968597412,
21
- "learning_rate": 4.82373263766481e-05,
22
- "loss": 3.761,
23
  "step": 1000
24
  },
25
  {
26
- "epoch": 0.052880208700557006,
27
- "grad_norm": 5.309798240661621,
28
- "learning_rate": 4.735598956497215e-05,
29
- "loss": 3.7096,
30
  "step": 1500
31
  },
32
  {
33
- "epoch": 0.07050694493407601,
34
- "grad_norm": 5.0922369956970215,
35
- "learning_rate": 4.64746527532962e-05,
36
- "loss": 3.6577,
37
  "step": 2000
38
  },
39
  {
40
- "epoch": 0.08813368116759501,
41
- "grad_norm": 5.067632675170898,
42
- "learning_rate": 4.559331594162025e-05,
43
- "loss": 3.6288,
44
  "step": 2500
45
  },
46
  {
47
- "epoch": 0.10576041740111401,
48
- "grad_norm": 5.3605475425720215,
49
- "learning_rate": 4.4711979129944304e-05,
50
- "loss": 3.6192,
51
  "step": 3000
52
  },
53
  {
54
- "epoch": 0.12338715363463301,
55
- "grad_norm": 5.510789394378662,
56
- "learning_rate": 4.383064231826835e-05,
57
- "loss": 3.559,
58
  "step": 3500
59
  },
60
  {
61
- "epoch": 0.14101388986815203,
62
- "grad_norm": 5.7333855628967285,
63
- "learning_rate": 4.29493055065924e-05,
64
- "loss": 3.5382,
65
  "step": 4000
66
  },
67
  {
68
- "epoch": 0.158640626101671,
69
- "grad_norm": 5.04295539855957,
70
- "learning_rate": 4.206796869491645e-05,
71
- "loss": 3.4962,
72
  "step": 4500
73
  },
74
  {
75
- "epoch": 0.17626736233519003,
76
- "grad_norm": 4.932398796081543,
77
- "learning_rate": 4.11866318832405e-05,
78
- "loss": 3.5339,
79
  "step": 5000
80
  },
81
  {
82
- "epoch": 0.193894098568709,
83
- "grad_norm": 5.262182235717773,
84
- "learning_rate": 4.0305295071564555e-05,
85
- "loss": 3.4758,
86
  "step": 5500
87
  },
88
  {
89
- "epoch": 0.21152083480222802,
90
- "grad_norm": 5.248316764831543,
91
- "learning_rate": 3.94239582598886e-05,
92
- "loss": 3.4524,
93
  "step": 6000
94
  },
95
  {
96
- "epoch": 0.229147571035747,
97
- "grad_norm": 5.176753520965576,
98
- "learning_rate": 3.854262144821265e-05,
99
- "loss": 3.4403,
100
  "step": 6500
101
  },
102
  {
103
- "epoch": 0.24677430726926602,
104
- "grad_norm": 5.396851539611816,
105
- "learning_rate": 3.76612846365367e-05,
106
- "loss": 3.4066,
107
  "step": 7000
108
  },
109
  {
110
- "epoch": 0.26440104350278504,
111
- "grad_norm": 4.905313968658447,
112
- "learning_rate": 3.677994782486075e-05,
113
- "loss": 3.4277,
114
  "step": 7500
115
  },
116
  {
117
- "epoch": 0.28202777973630405,
118
- "grad_norm": 5.581764221191406,
119
- "learning_rate": 3.58986110131848e-05,
120
- "loss": 3.3977,
121
  "step": 8000
122
  },
123
  {
124
- "epoch": 0.299654515969823,
125
- "grad_norm": 4.564020156860352,
126
- "learning_rate": 3.501727420150885e-05,
127
- "loss": 3.3739,
128
  "step": 8500
129
  },
130
  {
131
- "epoch": 0.317281252203342,
132
- "grad_norm": 5.451286315917969,
133
- "learning_rate": 3.41359373898329e-05,
134
- "loss": 3.3724,
135
  "step": 9000
136
  },
137
  {
138
- "epoch": 0.33490798843686104,
139
- "grad_norm": 5.060819149017334,
140
- "learning_rate": 3.325460057815695e-05,
141
- "loss": 3.3393,
142
  "step": 9500
143
  },
144
  {
145
- "epoch": 0.35253472467038005,
146
- "grad_norm": 5.474411487579346,
147
- "learning_rate": 3.2373263766481e-05,
148
- "loss": 3.3186,
149
  "step": 10000
150
  },
151
  {
152
- "epoch": 0.370161460903899,
153
- "grad_norm": 5.26786994934082,
154
- "learning_rate": 3.149192695480505e-05,
155
- "loss": 3.3223,
156
  "step": 10500
157
  },
158
  {
159
- "epoch": 0.387788197137418,
160
- "grad_norm": 5.467500686645508,
161
- "learning_rate": 3.06105901431291e-05,
162
- "loss": 3.3054,
163
  "step": 11000
164
  },
165
  {
166
- "epoch": 0.40541493337093704,
167
- "grad_norm": 5.263679027557373,
168
- "learning_rate": 2.972925333145315e-05,
169
- "loss": 3.3193,
170
  "step": 11500
171
  },
172
  {
173
- "epoch": 0.42304166960445605,
174
- "grad_norm": 4.835860729217529,
175
- "learning_rate": 2.88479165197772e-05,
176
- "loss": 3.2871,
177
  "step": 12000
178
  },
179
  {
180
- "epoch": 0.44066840583797506,
181
- "grad_norm": 4.88271951675415,
182
- "learning_rate": 2.7966579708101248e-05,
183
- "loss": 3.2783,
184
  "step": 12500
185
  },
186
  {
187
- "epoch": 0.458295142071494,
188
- "grad_norm": 5.228416442871094,
189
- "learning_rate": 2.70852428964253e-05,
190
- "loss": 3.2845,
191
  "step": 13000
192
  },
193
  {
194
- "epoch": 0.47592187830501304,
195
- "grad_norm": 5.097890853881836,
196
- "learning_rate": 2.6203906084749348e-05,
197
- "loss": 3.2731,
198
  "step": 13500
199
  },
200
  {
201
- "epoch": 0.49354861453853205,
202
- "grad_norm": 4.9926066398620605,
203
- "learning_rate": 2.53225692730734e-05,
204
- "loss": 3.27,
205
  "step": 14000
206
  },
207
  {
208
- "epoch": 0.511175350772051,
209
- "grad_norm": 5.329204559326172,
210
- "learning_rate": 2.4441232461397447e-05,
211
- "loss": 3.253,
212
  "step": 14500
213
  },
214
  {
215
- "epoch": 0.5288020870055701,
216
- "grad_norm": 4.740358352661133,
217
- "learning_rate": 2.35598956497215e-05,
218
- "loss": 3.2511,
219
  "step": 15000
220
  },
221
  {
222
- "epoch": 0.546428823239089,
223
- "grad_norm": 5.418153285980225,
224
- "learning_rate": 2.267855883804555e-05,
225
- "loss": 3.2315,
226
  "step": 15500
227
  },
228
  {
229
- "epoch": 0.5640555594726081,
230
- "grad_norm": 4.993420600891113,
231
- "learning_rate": 2.1797222026369598e-05,
232
- "loss": 3.2453,
233
  "step": 16000
234
  },
235
  {
236
- "epoch": 0.5816822957061271,
237
- "grad_norm": 5.474274635314941,
238
- "learning_rate": 2.091588521469365e-05,
239
- "loss": 3.2328,
240
  "step": 16500
241
  },
242
  {
243
- "epoch": 0.599309031939646,
244
- "grad_norm": 4.977609157562256,
245
- "learning_rate": 2.0034548403017698e-05,
246
- "loss": 3.2181,
247
  "step": 17000
248
  },
249
  {
250
- "epoch": 0.6169357681731651,
251
- "grad_norm": 4.982664585113525,
252
- "learning_rate": 1.915321159134175e-05,
253
- "loss": 3.2106,
254
  "step": 17500
255
  },
256
  {
257
- "epoch": 0.634562504406684,
258
- "grad_norm": 5.291051387786865,
259
- "learning_rate": 1.8271874779665797e-05,
260
- "loss": 3.2134,
261
  "step": 18000
262
  },
263
  {
264
- "epoch": 0.652189240640203,
265
- "grad_norm": 5.687000751495361,
266
- "learning_rate": 1.739053796798985e-05,
267
- "loss": 3.1905,
268
  "step": 18500
269
  },
270
  {
271
- "epoch": 0.6698159768737221,
272
- "grad_norm": 5.048547267913818,
273
- "learning_rate": 1.6509201156313897e-05,
274
- "loss": 3.2165,
275
  "step": 19000
276
  },
277
  {
278
- "epoch": 0.687442713107241,
279
- "grad_norm": 5.21890926361084,
280
- "learning_rate": 1.5627864344637945e-05,
281
- "loss": 3.216,
282
  "step": 19500
283
  },
284
  {
285
- "epoch": 0.7050694493407601,
286
- "grad_norm": 4.901352405548096,
287
- "learning_rate": 1.4746527532961998e-05,
288
- "loss": 3.1903,
289
  "step": 20000
290
  },
291
  {
292
- "epoch": 0.7226961855742791,
293
- "grad_norm": 5.835772514343262,
294
- "learning_rate": 1.3865190721286048e-05,
295
- "loss": 3.1971,
296
  "step": 20500
297
  },
298
  {
299
- "epoch": 0.740322921807798,
300
- "grad_norm": 4.900722503662109,
301
- "learning_rate": 1.2983853909610097e-05,
302
- "loss": 3.1832,
303
  "step": 21000
304
  },
305
  {
306
- "epoch": 0.7579496580413171,
307
- "grad_norm": 4.764721870422363,
308
- "learning_rate": 1.2102517097934147e-05,
309
- "loss": 3.1808,
310
  "step": 21500
311
  },
312
  {
313
- "epoch": 0.775576394274836,
314
- "grad_norm": 5.3555731773376465,
315
- "learning_rate": 1.1221180286258197e-05,
316
- "loss": 3.1847,
317
  "step": 22000
318
  },
319
  {
320
- "epoch": 0.7932031305083551,
321
- "grad_norm": 5.72691535949707,
322
- "learning_rate": 1.0339843474582247e-05,
323
- "loss": 3.1689,
324
  "step": 22500
325
  },
326
  {
327
- "epoch": 0.8108298667418741,
328
- "grad_norm": 5.263107776641846,
329
- "learning_rate": 9.458506662906296e-06,
330
- "loss": 3.1666,
331
  "step": 23000
332
  },
333
  {
334
- "epoch": 0.828456602975393,
335
- "grad_norm": 5.273736476898193,
336
- "learning_rate": 8.577169851230346e-06,
337
- "loss": 3.1583,
338
  "step": 23500
339
  },
340
  {
341
- "epoch": 0.8460833392089121,
342
- "grad_norm": 5.418051719665527,
343
- "learning_rate": 7.695833039554396e-06,
344
- "loss": 3.1429,
345
  "step": 24000
346
  },
347
  {
348
- "epoch": 0.8637100754424311,
349
- "grad_norm": 4.837016582489014,
350
- "learning_rate": 6.814496227878446e-06,
351
- "loss": 3.1831,
352
  "step": 24500
353
  },
354
  {
355
- "epoch": 0.8813368116759501,
356
- "grad_norm": 5.3440680503845215,
357
- "learning_rate": 5.933159416202496e-06,
358
- "loss": 3.151,
359
  "step": 25000
360
  },
361
  {
362
- "epoch": 0.8989635479094691,
363
- "grad_norm": 5.674468517303467,
364
- "learning_rate": 5.051822604526546e-06,
365
- "loss": 3.142,
366
  "step": 25500
367
  },
368
  {
369
- "epoch": 0.916590284142988,
370
- "grad_norm": 5.245038986206055,
371
- "learning_rate": 4.170485792850596e-06,
372
- "loss": 3.1537,
373
  "step": 26000
374
  },
375
  {
376
- "epoch": 0.9342170203765071,
377
- "grad_norm": 5.040459632873535,
378
- "learning_rate": 3.289148981174646e-06,
379
- "loss": 3.1496,
380
  "step": 26500
381
  },
382
  {
383
- "epoch": 0.9518437566100261,
384
- "grad_norm": 4.918792724609375,
385
- "learning_rate": 2.4078121694986958e-06,
386
- "loss": 3.1541,
387
  "step": 27000
388
  },
389
  {
390
- "epoch": 0.9694704928435451,
391
- "grad_norm": 5.169427394866943,
392
- "learning_rate": 1.5264753578227457e-06,
393
- "loss": 3.1609,
394
  "step": 27500
395
  },
396
  {
397
- "epoch": 0.9870972290770641,
398
- "grad_norm": 5.406129837036133,
399
- "learning_rate": 6.451385461467955e-07,
400
- "loss": 3.1467,
401
  "step": 28000
402
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  {
404
  "epoch": 1.0,
405
- "step": 28366,
406
- "total_flos": 1.5038202327662592e+16,
407
- "train_loss": 3.3210895868885175,
408
- "train_runtime": 6630.0799,
409
- "train_samples_per_second": 34.227,
410
- "train_steps_per_second": 4.278
411
  }
412
  ],
413
  "logging_steps": 500,
414
- "max_steps": 28366,
415
  "num_input_tokens_seen": 0,
416
  "num_train_epochs": 1,
417
  "save_steps": 500,
@@ -427,7 +1253,7 @@
427
  "attributes": {}
428
  }
429
  },
430
- "total_flos": 1.5038202327662592e+16,
431
  "train_batch_size": 8,
432
  "trial_name": null,
433
  "trial_params": null
 
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 87414,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.005719907566293729,
13
+ "grad_norm": 7.0386528968811035,
14
+ "learning_rate": 4.971400462168532e-05,
15
+ "loss": 7.2889,
16
  "step": 500
17
  },
18
  {
19
+ "epoch": 0.011439815132587457,
20
+ "grad_norm": 6.592615604400635,
21
+ "learning_rate": 4.942800924337063e-05,
22
+ "loss": 5.389,
23
  "step": 1000
24
  },
25
  {
26
+ "epoch": 0.017159722698881188,
27
+ "grad_norm": 6.088059425354004,
28
+ "learning_rate": 4.914201386505594e-05,
29
+ "loss": 4.8994,
30
  "step": 1500
31
  },
32
  {
33
+ "epoch": 0.022879630265174915,
34
+ "grad_norm": 6.355687618255615,
35
+ "learning_rate": 4.885601848674126e-05,
36
+ "loss": 4.6219,
37
  "step": 2000
38
  },
39
  {
40
+ "epoch": 0.028599537831468645,
41
+ "grad_norm": 6.1159186363220215,
42
+ "learning_rate": 4.8570023108426574e-05,
43
+ "loss": 4.4233,
44
  "step": 2500
45
  },
46
  {
47
+ "epoch": 0.034319445397762376,
48
+ "grad_norm": 6.636866092681885,
49
+ "learning_rate": 4.828402773011189e-05,
50
+ "loss": 4.2802,
51
  "step": 3000
52
  },
53
  {
54
+ "epoch": 0.0400393529640561,
55
+ "grad_norm": 5.87605619430542,
56
+ "learning_rate": 4.7998032351797196e-05,
57
+ "loss": 4.1453,
58
  "step": 3500
59
  },
60
  {
61
+ "epoch": 0.04575926053034983,
62
+ "grad_norm": 6.0966691970825195,
63
+ "learning_rate": 4.771203697348251e-05,
64
+ "loss": 4.0822,
65
  "step": 4000
66
  },
67
  {
68
+ "epoch": 0.05147916809664356,
69
+ "grad_norm": 5.99679708480835,
70
+ "learning_rate": 4.7426041595167824e-05,
71
+ "loss": 3.9941,
72
  "step": 4500
73
  },
74
  {
75
+ "epoch": 0.05719907566293729,
76
+ "grad_norm": 5.738614082336426,
77
+ "learning_rate": 4.714004621685314e-05,
78
+ "loss": 3.9104,
79
  "step": 5000
80
  },
81
  {
82
+ "epoch": 0.06291898322923102,
83
+ "grad_norm": 6.038717746734619,
84
+ "learning_rate": 4.6854050838538446e-05,
85
+ "loss": 3.8528,
86
  "step": 5500
87
  },
88
  {
89
+ "epoch": 0.06863889079552475,
90
+ "grad_norm": 5.92971134185791,
91
+ "learning_rate": 4.656805546022377e-05,
92
+ "loss": 3.78,
93
  "step": 6000
94
  },
95
  {
96
+ "epoch": 0.07435879836181847,
97
+ "grad_norm": 5.791173458099365,
98
+ "learning_rate": 4.628206008190908e-05,
99
+ "loss": 3.7407,
100
  "step": 6500
101
  },
102
  {
103
+ "epoch": 0.0800787059281122,
104
+ "grad_norm": 6.2113566398620605,
105
+ "learning_rate": 4.5996064703594396e-05,
106
+ "loss": 3.6834,
107
  "step": 7000
108
  },
109
  {
110
+ "epoch": 0.08579861349440593,
111
+ "grad_norm": 5.970207214355469,
112
+ "learning_rate": 4.57100693252797e-05,
113
+ "loss": 3.663,
114
  "step": 7500
115
  },
116
  {
117
+ "epoch": 0.09151852106069966,
118
+ "grad_norm": 5.8883514404296875,
119
+ "learning_rate": 4.542407394696502e-05,
120
+ "loss": 3.6161,
121
  "step": 8000
122
  },
123
  {
124
+ "epoch": 0.09723842862699339,
125
+ "grad_norm": 6.351154804229736,
126
+ "learning_rate": 4.513807856865033e-05,
127
+ "loss": 3.6193,
128
  "step": 8500
129
  },
130
  {
131
+ "epoch": 0.10295833619328712,
132
+ "grad_norm": 5.69071102142334,
133
+ "learning_rate": 4.4852083190335646e-05,
134
+ "loss": 3.545,
135
  "step": 9000
136
  },
137
  {
138
+ "epoch": 0.10867824375958085,
139
+ "grad_norm": 5.988426208496094,
140
+ "learning_rate": 4.456608781202096e-05,
141
+ "loss": 3.5088,
142
  "step": 9500
143
  },
144
  {
145
+ "epoch": 0.11439815132587458,
146
+ "grad_norm": 6.251087665557861,
147
+ "learning_rate": 4.4280092433706274e-05,
148
+ "loss": 3.5158,
149
  "step": 10000
150
  },
151
  {
152
+ "epoch": 0.1201180588921683,
153
+ "grad_norm": 5.840632438659668,
154
+ "learning_rate": 4.399409705539159e-05,
155
+ "loss": 3.5045,
156
  "step": 10500
157
  },
158
  {
159
+ "epoch": 0.12583796645846204,
160
+ "grad_norm": 5.9697723388671875,
161
+ "learning_rate": 4.37081016770769e-05,
162
+ "loss": 3.4466,
163
  "step": 11000
164
  },
165
  {
166
+ "epoch": 0.13155787402475577,
167
+ "grad_norm": 6.149275779724121,
168
+ "learning_rate": 4.342210629876222e-05,
169
+ "loss": 3.4544,
170
  "step": 11500
171
  },
172
  {
173
+ "epoch": 0.1372777815910495,
174
+ "grad_norm": 5.605130672454834,
175
+ "learning_rate": 4.3136110920447525e-05,
176
+ "loss": 3.4116,
177
  "step": 12000
178
  },
179
  {
180
+ "epoch": 0.1429976891573432,
181
+ "grad_norm": 5.5137152671813965,
182
+ "learning_rate": 4.285011554213284e-05,
183
+ "loss": 3.3919,
184
  "step": 12500
185
  },
186
  {
187
+ "epoch": 0.14871759672363694,
188
+ "grad_norm": 5.848428726196289,
189
+ "learning_rate": 4.256412016381815e-05,
190
+ "loss": 3.3695,
191
  "step": 13000
192
  },
193
  {
194
+ "epoch": 0.15443750428993067,
195
+ "grad_norm": 5.794093132019043,
196
+ "learning_rate": 4.227812478550347e-05,
197
+ "loss": 3.3509,
198
  "step": 13500
199
  },
200
  {
201
+ "epoch": 0.1601574118562244,
202
+ "grad_norm": 5.879131317138672,
203
+ "learning_rate": 4.199212940718878e-05,
204
+ "loss": 3.3551,
205
  "step": 14000
206
  },
207
  {
208
+ "epoch": 0.16587731942251813,
209
+ "grad_norm": 5.5501179695129395,
210
+ "learning_rate": 4.1706134028874096e-05,
211
+ "loss": 3.3327,
212
  "step": 14500
213
  },
214
  {
215
+ "epoch": 0.17159722698881186,
216
+ "grad_norm": 5.30273962020874,
217
+ "learning_rate": 4.142013865055941e-05,
218
+ "loss": 3.3045,
219
  "step": 15000
220
  },
221
  {
222
+ "epoch": 0.1773171345551056,
223
+ "grad_norm": 6.049214839935303,
224
+ "learning_rate": 4.1134143272244725e-05,
225
+ "loss": 3.2868,
226
  "step": 15500
227
  },
228
  {
229
+ "epoch": 0.18303704212139932,
230
+ "grad_norm": 5.499197483062744,
231
+ "learning_rate": 4.084814789393003e-05,
232
+ "loss": 3.2837,
233
  "step": 16000
234
  },
235
  {
236
+ "epoch": 0.18875694968769305,
237
+ "grad_norm": 5.448641777038574,
238
+ "learning_rate": 4.0562152515615347e-05,
239
+ "loss": 3.2751,
240
  "step": 16500
241
  },
242
  {
243
+ "epoch": 0.19447685725398678,
244
+ "grad_norm": 5.422529697418213,
245
+ "learning_rate": 4.027615713730066e-05,
246
+ "loss": 3.2585,
247
  "step": 17000
248
  },
249
  {
250
+ "epoch": 0.2001967648202805,
251
+ "grad_norm": 5.639166831970215,
252
+ "learning_rate": 3.9990161758985975e-05,
253
+ "loss": 3.2587,
254
  "step": 17500
255
  },
256
  {
257
+ "epoch": 0.20591667238657424,
258
+ "grad_norm": 5.255904674530029,
259
+ "learning_rate": 3.970416638067129e-05,
260
+ "loss": 3.2505,
261
  "step": 18000
262
  },
263
  {
264
+ "epoch": 0.21163657995286797,
265
+ "grad_norm": 5.453869819641113,
266
+ "learning_rate": 3.9418171002356604e-05,
267
+ "loss": 3.2382,
268
  "step": 18500
269
  },
270
  {
271
+ "epoch": 0.2173564875191617,
272
+ "grad_norm": 5.905749797821045,
273
+ "learning_rate": 3.913217562404192e-05,
274
+ "loss": 3.2309,
275
  "step": 19000
276
  },
277
  {
278
+ "epoch": 0.22307639508545543,
279
+ "grad_norm": 5.28553581237793,
280
+ "learning_rate": 3.884618024572723e-05,
281
+ "loss": 3.2308,
282
  "step": 19500
283
  },
284
  {
285
+ "epoch": 0.22879630265174916,
286
+ "grad_norm": 5.1942830085754395,
287
+ "learning_rate": 3.8560184867412547e-05,
288
+ "loss": 3.2163,
289
  "step": 20000
290
  },
291
  {
292
+ "epoch": 0.23451621021804286,
293
+ "grad_norm": 6.12723970413208,
294
+ "learning_rate": 3.8274189489097854e-05,
295
+ "loss": 3.2082,
296
  "step": 20500
297
  },
298
  {
299
+ "epoch": 0.2402361177843366,
300
+ "grad_norm": 5.664548873901367,
301
+ "learning_rate": 3.798819411078317e-05,
302
+ "loss": 3.1802,
303
  "step": 21000
304
  },
305
  {
306
+ "epoch": 0.24595602535063033,
307
+ "grad_norm": 5.903208255767822,
308
+ "learning_rate": 3.770219873246848e-05,
309
+ "loss": 3.1946,
310
  "step": 21500
311
  },
312
  {
313
+ "epoch": 0.2516759329169241,
314
+ "grad_norm": 5.729937553405762,
315
+ "learning_rate": 3.7416203354153804e-05,
316
+ "loss": 3.1646,
317
  "step": 22000
318
  },
319
  {
320
+ "epoch": 0.2573958404832178,
321
+ "grad_norm": 6.068752765655518,
322
+ "learning_rate": 3.713020797583911e-05,
323
+ "loss": 3.2006,
324
  "step": 22500
325
  },
326
  {
327
+ "epoch": 0.26311574804951154,
328
+ "grad_norm": 5.97099494934082,
329
+ "learning_rate": 3.6844212597524425e-05,
330
+ "loss": 3.1437,
331
  "step": 23000
332
  },
333
  {
334
+ "epoch": 0.26883565561580525,
335
+ "grad_norm": 5.777164936065674,
336
+ "learning_rate": 3.655821721920974e-05,
337
+ "loss": 3.1407,
338
  "step": 23500
339
  },
340
  {
341
+ "epoch": 0.274555563182099,
342
+ "grad_norm": 4.982606887817383,
343
+ "learning_rate": 3.6272221840895054e-05,
344
+ "loss": 3.1282,
345
  "step": 24000
346
  },
347
  {
348
+ "epoch": 0.2802754707483927,
349
+ "grad_norm": 5.495816230773926,
350
+ "learning_rate": 3.598622646258037e-05,
351
+ "loss": 3.1434,
352
  "step": 24500
353
  },
354
  {
355
+ "epoch": 0.2859953783146864,
356
+ "grad_norm": 5.898298263549805,
357
+ "learning_rate": 3.5700231084265676e-05,
358
+ "loss": 3.1446,
359
  "step": 25000
360
  },
361
  {
362
+ "epoch": 0.29171528588098017,
363
+ "grad_norm": 5.729229927062988,
364
+ "learning_rate": 3.541423570595099e-05,
365
+ "loss": 3.1254,
366
  "step": 25500
367
  },
368
  {
369
+ "epoch": 0.29743519344727387,
370
+ "grad_norm": 6.3333821296691895,
371
+ "learning_rate": 3.512824032763631e-05,
372
+ "loss": 3.1543,
373
  "step": 26000
374
  },
375
  {
376
+ "epoch": 0.30315510101356763,
377
+ "grad_norm": 6.0027756690979,
378
+ "learning_rate": 3.4842244949321625e-05,
379
+ "loss": 3.1379,
380
  "step": 26500
381
  },
382
  {
383
+ "epoch": 0.30887500857986133,
384
+ "grad_norm": 5.95717716217041,
385
+ "learning_rate": 3.455624957100693e-05,
386
+ "loss": 3.1188,
387
  "step": 27000
388
  },
389
  {
390
+ "epoch": 0.3145949161461551,
391
+ "grad_norm": 6.262216567993164,
392
+ "learning_rate": 3.427025419269225e-05,
393
+ "loss": 3.1096,
394
  "step": 27500
395
  },
396
  {
397
+ "epoch": 0.3203148237124488,
398
+ "grad_norm": 6.436416149139404,
399
+ "learning_rate": 3.398425881437756e-05,
400
+ "loss": 3.1294,
401
  "step": 28000
402
  },
403
+ {
404
+ "epoch": 0.32603473127874255,
405
+ "grad_norm": 5.524046421051025,
406
+ "learning_rate": 3.3698263436062876e-05,
407
+ "loss": 3.0965,
408
+ "step": 28500
409
+ },
410
+ {
411
+ "epoch": 0.33175463884503625,
412
+ "grad_norm": 5.237400531768799,
413
+ "learning_rate": 3.341226805774818e-05,
414
+ "loss": 3.0826,
415
+ "step": 29000
416
+ },
417
+ {
418
+ "epoch": 0.33747454641133,
419
+ "grad_norm": 5.551352500915527,
420
+ "learning_rate": 3.3126272679433504e-05,
421
+ "loss": 3.0666,
422
+ "step": 29500
423
+ },
424
+ {
425
+ "epoch": 0.3431944539776237,
426
+ "grad_norm": 5.407064914703369,
427
+ "learning_rate": 3.284027730111882e-05,
428
+ "loss": 3.0738,
429
+ "step": 30000
430
+ },
431
+ {
432
+ "epoch": 0.3489143615439175,
433
+ "grad_norm": 5.428358554840088,
434
+ "learning_rate": 3.255428192280413e-05,
435
+ "loss": 3.0628,
436
+ "step": 30500
437
+ },
438
+ {
439
+ "epoch": 0.3546342691102112,
440
+ "grad_norm": 6.178744792938232,
441
+ "learning_rate": 3.226828654448944e-05,
442
+ "loss": 3.0683,
443
+ "step": 31000
444
+ },
445
+ {
446
+ "epoch": 0.36035417667650493,
447
+ "grad_norm": 5.695249080657959,
448
+ "learning_rate": 3.1982291166174755e-05,
449
+ "loss": 3.0732,
450
+ "step": 31500
451
+ },
452
+ {
453
+ "epoch": 0.36607408424279864,
454
+ "grad_norm": 5.676379203796387,
455
+ "learning_rate": 3.169629578786007e-05,
456
+ "loss": 3.0531,
457
+ "step": 32000
458
+ },
459
+ {
460
+ "epoch": 0.37179399180909234,
461
+ "grad_norm": 5.420720100402832,
462
+ "learning_rate": 3.141030040954538e-05,
463
+ "loss": 3.0472,
464
+ "step": 32500
465
+ },
466
+ {
467
+ "epoch": 0.3775138993753861,
468
+ "grad_norm": 5.6645379066467285,
469
+ "learning_rate": 3.11243050312307e-05,
470
+ "loss": 3.0391,
471
+ "step": 33000
472
+ },
473
+ {
474
+ "epoch": 0.3832338069416798,
475
+ "grad_norm": 6.123884201049805,
476
+ "learning_rate": 3.083830965291601e-05,
477
+ "loss": 3.0527,
478
+ "step": 33500
479
+ },
480
+ {
481
+ "epoch": 0.38895371450797356,
482
+ "grad_norm": 5.331460475921631,
483
+ "learning_rate": 3.0552314274601326e-05,
484
+ "loss": 3.049,
485
+ "step": 34000
486
+ },
487
+ {
488
+ "epoch": 0.39467362207426726,
489
+ "grad_norm": 6.356675624847412,
490
+ "learning_rate": 3.0266318896286637e-05,
491
+ "loss": 3.0416,
492
+ "step": 34500
493
+ },
494
+ {
495
+ "epoch": 0.400393529640561,
496
+ "grad_norm": 6.067440986633301,
497
+ "learning_rate": 2.9980323517971955e-05,
498
+ "loss": 3.0303,
499
+ "step": 35000
500
+ },
501
+ {
502
+ "epoch": 0.4061134372068547,
503
+ "grad_norm": 5.3762030601501465,
504
+ "learning_rate": 2.9694328139657262e-05,
505
+ "loss": 3.0363,
506
+ "step": 35500
507
+ },
508
+ {
509
+ "epoch": 0.4118333447731485,
510
+ "grad_norm": 5.837817192077637,
511
+ "learning_rate": 2.9408332761342576e-05,
512
+ "loss": 3.0284,
513
+ "step": 36000
514
+ },
515
+ {
516
+ "epoch": 0.4175532523394422,
517
+ "grad_norm": 5.484166622161865,
518
+ "learning_rate": 2.912233738302789e-05,
519
+ "loss": 2.9946,
520
+ "step": 36500
521
+ },
522
+ {
523
+ "epoch": 0.42327315990573594,
524
+ "grad_norm": 5.776547908782959,
525
+ "learning_rate": 2.883634200471321e-05,
526
+ "loss": 3.037,
527
+ "step": 37000
528
+ },
529
+ {
530
+ "epoch": 0.42899306747202964,
531
+ "grad_norm": 5.481433868408203,
532
+ "learning_rate": 2.8550346626398516e-05,
533
+ "loss": 3.0289,
534
+ "step": 37500
535
+ },
536
+ {
537
+ "epoch": 0.4347129750383234,
538
+ "grad_norm": 5.784084320068359,
539
+ "learning_rate": 2.826435124808383e-05,
540
+ "loss": 2.9928,
541
+ "step": 38000
542
+ },
543
+ {
544
+ "epoch": 0.4404328826046171,
545
+ "grad_norm": 5.899621486663818,
546
+ "learning_rate": 2.7978355869769148e-05,
547
+ "loss": 3.0009,
548
+ "step": 38500
549
+ },
550
+ {
551
+ "epoch": 0.44615279017091086,
552
+ "grad_norm": 5.488452434539795,
553
+ "learning_rate": 2.7692360491454462e-05,
554
+ "loss": 3.0099,
555
+ "step": 39000
556
+ },
557
+ {
558
+ "epoch": 0.45187269773720457,
559
+ "grad_norm": 5.848759174346924,
560
+ "learning_rate": 2.740636511313977e-05,
561
+ "loss": 3.0009,
562
+ "step": 39500
563
+ },
564
+ {
565
+ "epoch": 0.4575926053034983,
566
+ "grad_norm": 5.612068176269531,
567
+ "learning_rate": 2.7120369734825084e-05,
568
+ "loss": 2.9828,
569
+ "step": 40000
570
+ },
571
+ {
572
+ "epoch": 0.463312512869792,
573
+ "grad_norm": 5.79826021194458,
574
+ "learning_rate": 2.68343743565104e-05,
575
+ "loss": 3.0152,
576
+ "step": 40500
577
+ },
578
+ {
579
+ "epoch": 0.46903242043608573,
580
+ "grad_norm": 5.842123508453369,
581
+ "learning_rate": 2.6548378978195716e-05,
582
+ "loss": 2.9719,
583
+ "step": 41000
584
+ },
585
+ {
586
+ "epoch": 0.4747523280023795,
587
+ "grad_norm": 5.69782018661499,
588
+ "learning_rate": 2.626238359988103e-05,
589
+ "loss": 2.975,
590
+ "step": 41500
591
+ },
592
+ {
593
+ "epoch": 0.4804722355686732,
594
+ "grad_norm": 6.189919948577881,
595
+ "learning_rate": 2.5976388221566338e-05,
596
+ "loss": 2.9709,
597
+ "step": 42000
598
+ },
599
+ {
600
+ "epoch": 0.48619214313496695,
601
+ "grad_norm": 5.761579513549805,
602
+ "learning_rate": 2.5690392843251655e-05,
603
+ "loss": 2.9583,
604
+ "step": 42500
605
+ },
606
+ {
607
+ "epoch": 0.49191205070126065,
608
+ "grad_norm": 6.164900779724121,
609
+ "learning_rate": 2.540439746493697e-05,
610
+ "loss": 2.9742,
611
+ "step": 43000
612
+ },
613
+ {
614
+ "epoch": 0.4976319582675544,
615
+ "grad_norm": 5.3809285163879395,
616
+ "learning_rate": 2.5118402086622284e-05,
617
+ "loss": 2.9849,
618
+ "step": 43500
619
+ },
620
+ {
621
+ "epoch": 0.5033518658338482,
622
+ "grad_norm": 5.787545680999756,
623
+ "learning_rate": 2.4832406708307595e-05,
624
+ "loss": 2.963,
625
+ "step": 44000
626
+ },
627
+ {
628
+ "epoch": 0.5090717734001419,
629
+ "grad_norm": 5.825649261474609,
630
+ "learning_rate": 2.454641132999291e-05,
631
+ "loss": 2.9634,
632
+ "step": 44500
633
+ },
634
+ {
635
+ "epoch": 0.5147916809664356,
636
+ "grad_norm": 5.936666488647461,
637
+ "learning_rate": 2.4260415951678223e-05,
638
+ "loss": 2.9678,
639
+ "step": 45000
640
+ },
641
+ {
642
+ "epoch": 0.5205115885327293,
643
+ "grad_norm": 5.980503082275391,
644
+ "learning_rate": 2.3974420573363534e-05,
645
+ "loss": 2.9675,
646
+ "step": 45500
647
+ },
648
+ {
649
+ "epoch": 0.5262314960990231,
650
+ "grad_norm": 5.755555629730225,
651
+ "learning_rate": 2.368842519504885e-05,
652
+ "loss": 2.9597,
653
+ "step": 46000
654
+ },
655
+ {
656
+ "epoch": 0.5319514036653168,
657
+ "grad_norm": 5.1978936195373535,
658
+ "learning_rate": 2.3402429816734163e-05,
659
+ "loss": 2.947,
660
+ "step": 46500
661
+ },
662
+ {
663
+ "epoch": 0.5376713112316105,
664
+ "grad_norm": 5.265974521636963,
665
+ "learning_rate": 2.3116434438419477e-05,
666
+ "loss": 2.9521,
667
+ "step": 47000
668
+ },
669
+ {
670
+ "epoch": 0.5433912187979042,
671
+ "grad_norm": 6.028165340423584,
672
+ "learning_rate": 2.2830439060104788e-05,
673
+ "loss": 2.9579,
674
+ "step": 47500
675
+ },
676
+ {
677
+ "epoch": 0.549111126364198,
678
+ "grad_norm": 5.533000946044922,
679
+ "learning_rate": 2.2544443681790102e-05,
680
+ "loss": 2.9541,
681
+ "step": 48000
682
+ },
683
+ {
684
+ "epoch": 0.5548310339304917,
685
+ "grad_norm": 5.428481101989746,
686
+ "learning_rate": 2.225844830347542e-05,
687
+ "loss": 2.9532,
688
+ "step": 48500
689
+ },
690
+ {
691
+ "epoch": 0.5605509414967854,
692
+ "grad_norm": 5.905336856842041,
693
+ "learning_rate": 2.197245292516073e-05,
694
+ "loss": 2.94,
695
+ "step": 49000
696
+ },
697
+ {
698
+ "epoch": 0.5662708490630791,
699
+ "grad_norm": 6.032477855682373,
700
+ "learning_rate": 2.1686457546846045e-05,
701
+ "loss": 2.9435,
702
+ "step": 49500
703
+ },
704
+ {
705
+ "epoch": 0.5719907566293728,
706
+ "grad_norm": 5.996410369873047,
707
+ "learning_rate": 2.1400462168531356e-05,
708
+ "loss": 2.9358,
709
+ "step": 50000
710
+ },
711
+ {
712
+ "epoch": 0.5777106641956666,
713
+ "grad_norm": 5.634001731872559,
714
+ "learning_rate": 2.1114466790216674e-05,
715
+ "loss": 2.9361,
716
+ "step": 50500
717
+ },
718
+ {
719
+ "epoch": 0.5834305717619603,
720
+ "grad_norm": 5.509332656860352,
721
+ "learning_rate": 2.0828471411901985e-05,
722
+ "loss": 2.9314,
723
+ "step": 51000
724
+ },
725
+ {
726
+ "epoch": 0.589150479328254,
727
+ "grad_norm": 6.294771194458008,
728
+ "learning_rate": 2.05424760335873e-05,
729
+ "loss": 2.9238,
730
+ "step": 51500
731
+ },
732
+ {
733
+ "epoch": 0.5948703868945477,
734
+ "grad_norm": 5.542776107788086,
735
+ "learning_rate": 2.025648065527261e-05,
736
+ "loss": 2.9226,
737
+ "step": 52000
738
+ },
739
+ {
740
+ "epoch": 0.6005902944608416,
741
+ "grad_norm": 5.870414733886719,
742
+ "learning_rate": 1.9970485276957927e-05,
743
+ "loss": 2.9202,
744
+ "step": 52500
745
+ },
746
+ {
747
+ "epoch": 0.6063102020271353,
748
+ "grad_norm": 6.047051429748535,
749
+ "learning_rate": 1.9684489898643238e-05,
750
+ "loss": 2.9285,
751
+ "step": 53000
752
+ },
753
+ {
754
+ "epoch": 0.612030109593429,
755
+ "grad_norm": 5.594234943389893,
756
+ "learning_rate": 1.9398494520328553e-05,
757
+ "loss": 2.9307,
758
+ "step": 53500
759
+ },
760
+ {
761
+ "epoch": 0.6177500171597227,
762
+ "grad_norm": 5.4298295974731445,
763
+ "learning_rate": 1.9112499142013863e-05,
764
+ "loss": 2.9382,
765
+ "step": 54000
766
+ },
767
+ {
768
+ "epoch": 0.6234699247260165,
769
+ "grad_norm": 6.184563636779785,
770
+ "learning_rate": 1.882650376369918e-05,
771
+ "loss": 2.9341,
772
+ "step": 54500
773
+ },
774
+ {
775
+ "epoch": 0.6291898322923102,
776
+ "grad_norm": 5.776815414428711,
777
+ "learning_rate": 1.8540508385384492e-05,
778
+ "loss": 2.9287,
779
+ "step": 55000
780
+ },
781
+ {
782
+ "epoch": 0.6349097398586039,
783
+ "grad_norm": 5.83139181137085,
784
+ "learning_rate": 1.8254513007069806e-05,
785
+ "loss": 2.9007,
786
+ "step": 55500
787
+ },
788
+ {
789
+ "epoch": 0.6406296474248976,
790
+ "grad_norm": 5.469008922576904,
791
+ "learning_rate": 1.7968517628755117e-05,
792
+ "loss": 2.904,
793
+ "step": 56000
794
+ },
795
+ {
796
+ "epoch": 0.6463495549911913,
797
+ "grad_norm": 6.898833751678467,
798
+ "learning_rate": 1.7682522250440435e-05,
799
+ "loss": 2.9105,
800
+ "step": 56500
801
+ },
802
+ {
803
+ "epoch": 0.6520694625574851,
804
+ "grad_norm": 5.798022747039795,
805
+ "learning_rate": 1.739652687212575e-05,
806
+ "loss": 2.8995,
807
+ "step": 57000
808
+ },
809
+ {
810
+ "epoch": 0.6577893701237788,
811
+ "grad_norm": 5.57025146484375,
812
+ "learning_rate": 1.711053149381106e-05,
813
+ "loss": 2.9093,
814
+ "step": 57500
815
+ },
816
+ {
817
+ "epoch": 0.6635092776900725,
818
+ "grad_norm": 5.779621124267578,
819
+ "learning_rate": 1.6824536115496374e-05,
820
+ "loss": 2.9118,
821
+ "step": 58000
822
+ },
823
+ {
824
+ "epoch": 0.6692291852563662,
825
+ "grad_norm": 5.683529853820801,
826
+ "learning_rate": 1.653854073718169e-05,
827
+ "loss": 2.9102,
828
+ "step": 58500
829
+ },
830
+ {
831
+ "epoch": 0.67494909282266,
832
+ "grad_norm": 6.1109538078308105,
833
+ "learning_rate": 1.6252545358867003e-05,
834
+ "loss": 2.9102,
835
+ "step": 59000
836
+ },
837
+ {
838
+ "epoch": 0.6806690003889537,
839
+ "grad_norm": 5.536868095397949,
840
+ "learning_rate": 1.5966549980552314e-05,
841
+ "loss": 2.8966,
842
+ "step": 59500
843
+ },
844
+ {
845
+ "epoch": 0.6863889079552474,
846
+ "grad_norm": 6.560556888580322,
847
+ "learning_rate": 1.5680554602237628e-05,
848
+ "loss": 2.8896,
849
+ "step": 60000
850
+ },
851
+ {
852
+ "epoch": 0.6921088155215411,
853
+ "grad_norm": 5.969814300537109,
854
+ "learning_rate": 1.5394559223922942e-05,
855
+ "loss": 2.8961,
856
+ "step": 60500
857
+ },
858
+ {
859
+ "epoch": 0.697828723087835,
860
+ "grad_norm": 5.238883018493652,
861
+ "learning_rate": 1.5108563845608257e-05,
862
+ "loss": 2.882,
863
+ "step": 61000
864
+ },
865
+ {
866
+ "epoch": 0.7035486306541286,
867
+ "grad_norm": 5.538156509399414,
868
+ "learning_rate": 1.4822568467293567e-05,
869
+ "loss": 2.8955,
870
+ "step": 61500
871
+ },
872
+ {
873
+ "epoch": 0.7092685382204224,
874
+ "grad_norm": 6.287049770355225,
875
+ "learning_rate": 1.4536573088978883e-05,
876
+ "loss": 2.8932,
877
+ "step": 62000
878
+ },
879
+ {
880
+ "epoch": 0.714988445786716,
881
+ "grad_norm": 6.05150032043457,
882
+ "learning_rate": 1.4250577710664196e-05,
883
+ "loss": 2.8868,
884
+ "step": 62500
885
+ },
886
+ {
887
+ "epoch": 0.7207083533530099,
888
+ "grad_norm": 5.857907295227051,
889
+ "learning_rate": 1.396458233234951e-05,
890
+ "loss": 2.8872,
891
+ "step": 63000
892
+ },
893
+ {
894
+ "epoch": 0.7264282609193036,
895
+ "grad_norm": 5.748091697692871,
896
+ "learning_rate": 1.3678586954034823e-05,
897
+ "loss": 2.8964,
898
+ "step": 63500
899
+ },
900
+ {
901
+ "epoch": 0.7321481684855973,
902
+ "grad_norm": 5.55509614944458,
903
+ "learning_rate": 1.3392591575720137e-05,
904
+ "loss": 2.8758,
905
+ "step": 64000
906
+ },
907
+ {
908
+ "epoch": 0.737868076051891,
909
+ "grad_norm": 6.135568618774414,
910
+ "learning_rate": 1.3106596197405451e-05,
911
+ "loss": 2.8685,
912
+ "step": 64500
913
+ },
914
+ {
915
+ "epoch": 0.7435879836181847,
916
+ "grad_norm": 5.816187381744385,
917
+ "learning_rate": 1.2820600819090764e-05,
918
+ "loss": 2.8802,
919
+ "step": 65000
920
+ },
921
+ {
922
+ "epoch": 0.7493078911844785,
923
+ "grad_norm": 6.309732437133789,
924
+ "learning_rate": 1.2534605440776078e-05,
925
+ "loss": 2.8859,
926
+ "step": 65500
927
+ },
928
+ {
929
+ "epoch": 0.7550277987507722,
930
+ "grad_norm": 5.721366882324219,
931
+ "learning_rate": 1.2248610062461391e-05,
932
+ "loss": 2.8702,
933
+ "step": 66000
934
+ },
935
+ {
936
+ "epoch": 0.7607477063170659,
937
+ "grad_norm": 5.648174285888672,
938
+ "learning_rate": 1.1962614684146704e-05,
939
+ "loss": 2.8603,
940
+ "step": 66500
941
+ },
942
+ {
943
+ "epoch": 0.7664676138833596,
944
+ "grad_norm": 6.4422688484191895,
945
+ "learning_rate": 1.1676619305832018e-05,
946
+ "loss": 2.875,
947
+ "step": 67000
948
+ },
949
+ {
950
+ "epoch": 0.7721875214496534,
951
+ "grad_norm": 5.673862934112549,
952
+ "learning_rate": 1.1390623927517332e-05,
953
+ "loss": 2.8743,
954
+ "step": 67500
955
+ },
956
+ {
957
+ "epoch": 0.7779074290159471,
958
+ "grad_norm": 5.87379789352417,
959
+ "learning_rate": 1.1104628549202645e-05,
960
+ "loss": 2.8678,
961
+ "step": 68000
962
+ },
963
+ {
964
+ "epoch": 0.7836273365822408,
965
+ "grad_norm": 5.968353271484375,
966
+ "learning_rate": 1.0818633170887959e-05,
967
+ "loss": 2.8797,
968
+ "step": 68500
969
+ },
970
+ {
971
+ "epoch": 0.7893472441485345,
972
+ "grad_norm": 5.516451835632324,
973
+ "learning_rate": 1.0532637792573273e-05,
974
+ "loss": 2.8716,
975
+ "step": 69000
976
+ },
977
+ {
978
+ "epoch": 0.7950671517148283,
979
+ "grad_norm": 6.277103900909424,
980
+ "learning_rate": 1.0246642414258586e-05,
981
+ "loss": 2.8483,
982
+ "step": 69500
983
+ },
984
+ {
985
+ "epoch": 0.800787059281122,
986
+ "grad_norm": 5.54793643951416,
987
+ "learning_rate": 9.9606470359439e-06,
988
+ "loss": 2.8462,
989
+ "step": 70000
990
+ },
991
+ {
992
+ "epoch": 0.8065069668474157,
993
+ "grad_norm": 5.989738464355469,
994
+ "learning_rate": 9.674651657629213e-06,
995
+ "loss": 2.8672,
996
+ "step": 70500
997
+ },
998
+ {
999
+ "epoch": 0.8122268744137094,
1000
+ "grad_norm": 5.7795844078063965,
1001
+ "learning_rate": 9.388656279314527e-06,
1002
+ "loss": 2.8566,
1003
+ "step": 71000
1004
+ },
1005
+ {
1006
+ "epoch": 0.8179467819800033,
1007
+ "grad_norm": 5.732882976531982,
1008
+ "learning_rate": 9.10266090099984e-06,
1009
+ "loss": 2.8497,
1010
+ "step": 71500
1011
+ },
1012
+ {
1013
+ "epoch": 0.823666689546297,
1014
+ "grad_norm": 6.427890777587891,
1015
+ "learning_rate": 8.816665522685154e-06,
1016
+ "loss": 2.8656,
1017
+ "step": 72000
1018
+ },
1019
+ {
1020
+ "epoch": 0.8293865971125907,
1021
+ "grad_norm": 6.1445746421813965,
1022
+ "learning_rate": 8.530670144370468e-06,
1023
+ "loss": 2.8425,
1024
+ "step": 72500
1025
+ },
1026
+ {
1027
+ "epoch": 0.8351065046788844,
1028
+ "grad_norm": 6.342021465301514,
1029
+ "learning_rate": 8.24467476605578e-06,
1030
+ "loss": 2.859,
1031
+ "step": 73000
1032
+ },
1033
+ {
1034
+ "epoch": 0.8408264122451781,
1035
+ "grad_norm": 6.117573261260986,
1036
+ "learning_rate": 7.958679387741095e-06,
1037
+ "loss": 2.8631,
1038
+ "step": 73500
1039
+ },
1040
+ {
1041
+ "epoch": 0.8465463198114719,
1042
+ "grad_norm": 6.145172595977783,
1043
+ "learning_rate": 7.672684009426408e-06,
1044
+ "loss": 2.8536,
1045
+ "step": 74000
1046
+ },
1047
+ {
1048
+ "epoch": 0.8522662273777656,
1049
+ "grad_norm": 5.709812641143799,
1050
+ "learning_rate": 7.386688631111721e-06,
1051
+ "loss": 2.8546,
1052
+ "step": 74500
1053
+ },
1054
+ {
1055
+ "epoch": 0.8579861349440593,
1056
+ "grad_norm": 6.244381904602051,
1057
+ "learning_rate": 7.100693252797034e-06,
1058
+ "loss": 2.8277,
1059
+ "step": 75000
1060
+ },
1061
+ {
1062
+ "epoch": 0.863706042510353,
1063
+ "grad_norm": 6.229698181152344,
1064
+ "learning_rate": 6.814697874482348e-06,
1065
+ "loss": 2.85,
1066
+ "step": 75500
1067
+ },
1068
+ {
1069
+ "epoch": 0.8694259500766468,
1070
+ "grad_norm": 5.480854511260986,
1071
+ "learning_rate": 6.528702496167661e-06,
1072
+ "loss": 2.8576,
1073
+ "step": 76000
1074
+ },
1075
+ {
1076
+ "epoch": 0.8751458576429405,
1077
+ "grad_norm": 6.088413715362549,
1078
+ "learning_rate": 6.2427071178529756e-06,
1079
+ "loss": 2.8618,
1080
+ "step": 76500
1081
+ },
1082
+ {
1083
+ "epoch": 0.8808657652092342,
1084
+ "grad_norm": 6.159554481506348,
1085
+ "learning_rate": 5.956711739538289e-06,
1086
+ "loss": 2.8319,
1087
+ "step": 77000
1088
+ },
1089
+ {
1090
+ "epoch": 0.8865856727755279,
1091
+ "grad_norm": 6.287491798400879,
1092
+ "learning_rate": 5.6707163612236024e-06,
1093
+ "loss": 2.8622,
1094
+ "step": 77500
1095
+ },
1096
+ {
1097
+ "epoch": 0.8923055803418217,
1098
+ "grad_norm": 6.237947940826416,
1099
+ "learning_rate": 5.384720982908916e-06,
1100
+ "loss": 2.8539,
1101
+ "step": 78000
1102
+ },
1103
+ {
1104
+ "epoch": 0.8980254879081154,
1105
+ "grad_norm": 6.096075057983398,
1106
+ "learning_rate": 5.09872560459423e-06,
1107
+ "loss": 2.8256,
1108
+ "step": 78500
1109
+ },
1110
+ {
1111
+ "epoch": 0.9037453954744091,
1112
+ "grad_norm": 6.158285140991211,
1113
+ "learning_rate": 4.812730226279544e-06,
1114
+ "loss": 2.8476,
1115
+ "step": 79000
1116
+ },
1117
+ {
1118
+ "epoch": 0.9094653030407028,
1119
+ "grad_norm": 6.239354133605957,
1120
+ "learning_rate": 4.526734847964857e-06,
1121
+ "loss": 2.8538,
1122
+ "step": 79500
1123
+ },
1124
+ {
1125
+ "epoch": 0.9151852106069966,
1126
+ "grad_norm": 5.273146629333496,
1127
+ "learning_rate": 4.2407394696501705e-06,
1128
+ "loss": 2.849,
1129
+ "step": 80000
1130
+ },
1131
+ {
1132
+ "epoch": 0.9209051181732903,
1133
+ "grad_norm": 6.121274948120117,
1134
+ "learning_rate": 3.954744091335484e-06,
1135
+ "loss": 2.8454,
1136
+ "step": 80500
1137
+ },
1138
+ {
1139
+ "epoch": 0.926625025739584,
1140
+ "grad_norm": 6.213692665100098,
1141
+ "learning_rate": 3.6687487130207977e-06,
1142
+ "loss": 2.8431,
1143
+ "step": 81000
1144
+ },
1145
+ {
1146
+ "epoch": 0.9323449333058778,
1147
+ "grad_norm": 6.369662761688232,
1148
+ "learning_rate": 3.382753334706111e-06,
1149
+ "loss": 2.8589,
1150
+ "step": 81500
1151
+ },
1152
+ {
1153
+ "epoch": 0.9380648408721715,
1154
+ "grad_norm": 5.934210300445557,
1155
+ "learning_rate": 3.096757956391425e-06,
1156
+ "loss": 2.8371,
1157
+ "step": 82000
1158
+ },
1159
+ {
1160
+ "epoch": 0.9437847484384653,
1161
+ "grad_norm": 6.598156452178955,
1162
+ "learning_rate": 2.8107625780767385e-06,
1163
+ "loss": 2.8282,
1164
+ "step": 82500
1165
+ },
1166
+ {
1167
+ "epoch": 0.949504656004759,
1168
+ "grad_norm": 5.81234073638916,
1169
+ "learning_rate": 2.524767199762052e-06,
1170
+ "loss": 2.8244,
1171
+ "step": 83000
1172
+ },
1173
+ {
1174
+ "epoch": 0.9552245635710527,
1175
+ "grad_norm": 6.145198822021484,
1176
+ "learning_rate": 2.2387718214473658e-06,
1177
+ "loss": 2.8294,
1178
+ "step": 83500
1179
+ },
1180
+ {
1181
+ "epoch": 0.9609444711373464,
1182
+ "grad_norm": 5.321970462799072,
1183
+ "learning_rate": 1.952776443132679e-06,
1184
+ "loss": 2.8425,
1185
+ "step": 84000
1186
+ },
1187
+ {
1188
+ "epoch": 0.9666643787036402,
1189
+ "grad_norm": 5.8561248779296875,
1190
+ "learning_rate": 1.6667810648179926e-06,
1191
+ "loss": 2.8618,
1192
+ "step": 84500
1193
+ },
1194
+ {
1195
+ "epoch": 0.9723842862699339,
1196
+ "grad_norm": 6.001429080963135,
1197
+ "learning_rate": 1.3807856865033063e-06,
1198
+ "loss": 2.8456,
1199
+ "step": 85000
1200
+ },
1201
+ {
1202
+ "epoch": 0.9781041938362276,
1203
+ "grad_norm": 6.074833393096924,
1204
+ "learning_rate": 1.0947903081886197e-06,
1205
+ "loss": 2.8408,
1206
+ "step": 85500
1207
+ },
1208
+ {
1209
+ "epoch": 0.9838241014025213,
1210
+ "grad_norm": 5.579460144042969,
1211
+ "learning_rate": 8.087949298739332e-07,
1212
+ "loss": 2.8427,
1213
+ "step": 86000
1214
+ },
1215
+ {
1216
+ "epoch": 0.9895440089688151,
1217
+ "grad_norm": 5.968313217163086,
1218
+ "learning_rate": 5.227995515592468e-07,
1219
+ "loss": 2.8345,
1220
+ "step": 86500
1221
+ },
1222
+ {
1223
+ "epoch": 0.9952639165351088,
1224
+ "grad_norm": 5.920719146728516,
1225
+ "learning_rate": 2.3680417324456038e-07,
1226
+ "loss": 2.8337,
1227
+ "step": 87000
1228
+ },
1229
  {
1230
  "epoch": 1.0,
1231
+ "step": 87414,
1232
+ "total_flos": 4.634223291773338e+16,
1233
+ "train_loss": 3.132497888326312,
1234
+ "train_runtime": 20429.0413,
1235
+ "train_samples_per_second": 34.231,
1236
+ "train_steps_per_second": 4.279
1237
  }
1238
  ],
1239
  "logging_steps": 500,
1240
+ "max_steps": 87414,
1241
  "num_input_tokens_seen": 0,
1242
  "num_train_epochs": 1,
1243
  "save_steps": 500,
 
1253
  "attributes": {}
1254
  }
1255
  },
1256
+ "total_flos": 4.634223291773338e+16,
1257
  "train_batch_size": 8,
1258
  "trial_name": null,
1259
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1ebc3c8cf034541f337347c16a9572f2dada04919b0087a438aadaad09a5406
3
- size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c62c2584fefcb587403e55ceb246ccd105b535709a1ff9735da2465de367912
3
+ size 5368