yyx123 committed on
Commit
c1d3f12
1 Parent(s): a633086

Model save

Browse files
README.md CHANGED
@@ -2,13 +2,9 @@
2
  license: other
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
- - generated_from_trainer
7
  - trl
8
  - sft
9
  - generated_from_trainer
10
- datasets:
11
- - ruozhiba
12
  base_model: 01-ai/Yi-6B
13
  model-index:
14
  - name: Yi-6B-ruozhiba-1e-5-50
@@ -20,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  # Yi-6B-ruozhiba-1e-5-50
22
 
23
- This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the ruozhiba dataset.
24
  It achieves the following results on the evaluation set:
25
- - Loss: 1.9875
26
 
27
  ## Model description
28
 
@@ -41,35 +37,25 @@ More information needed
41
  ### Training hyperparameters
42
 
43
  The following hyperparameters were used during training:
44
- - learning_rate: 1e-05
45
  - train_batch_size: 4
46
  - eval_batch_size: 4
47
  - seed: 42
48
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
  - lr_scheduler_type: cosine
50
  - lr_scheduler_warmup_ratio: 0.1
51
- - num_epochs: 20
52
 
53
  ### Training results
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
- | 1.8114 | 4.0 | 220 | 1.8505 |
58
- | 1.6723 | 5.0 | 275 | 1.8372 |
59
- | 1.6532 | 6.0 | 330 | 1.8296 |
60
- | 1.7187 | 7.0 | 385 | 1.8273 |
61
- | 1.6945 | 8.0 | 440 | 1.8345 |
62
- | 1.5494 | 9.0 | 495 | 1.8452 |
63
- | 1.5329 | 10.0 | 550 | 1.8665 |
64
- | 1.4105 | 11.0 | 605 | 1.8877 |
65
- | 1.3862 | 12.0 | 660 | 1.9066 |
66
- | 1.4126 | 13.0 | 715 | 1.9303 |
67
- | 1.388 | 14.0 | 770 | 1.9449 |
68
- | 1.3653 | 15.0 | 825 | 1.9637 |
69
- | 1.361 | 16.0 | 880 | 1.9738 |
70
- | 1.2944 | 17.0 | 935 | 1.9819 |
71
- | 1.3433 | 18.0 | 990 | 1.9856 |
72
- | 1.2058 | 19.0 | 1045 | 1.9871 |
73
 
74
 
75
  ### Framework versions
 
2
  license: other
3
  library_name: peft
4
  tags:
 
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
 
 
8
  base_model: 01-ai/Yi-6B
9
  model-index:
10
  - name: Yi-6B-ruozhiba-1e-5-50
 
16
 
17
  # Yi-6B-ruozhiba-1e-5-50
18
 
19
+ This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 3.4046
22
 
23
  ## Model description
24
 
 
37
  ### Training hyperparameters
38
 
39
  The following hyperparameters were used during training:
40
+ - learning_rate: 0.0005
41
  - train_batch_size: 4
42
  - eval_batch_size: 4
43
  - seed: 42
44
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
  - lr_scheduler_type: cosine
46
  - lr_scheduler_warmup_ratio: 0.1
47
+ - num_epochs: 10
48
 
49
  ### Training results
50
 
51
  | Training Loss | Epoch | Step | Validation Loss |
52
  |:-------------:|:-----:|:----:|:---------------:|
53
+ | 1.8713 | 1.0 | 55 | 1.8389 |
54
+ | 0.1631 | 5.0 | 275 | 2.9787 |
55
+ | 0.0765 | 6.0 | 330 | 3.0907 |
56
+ | 0.0489 | 7.0 | 385 | 3.2638 |
57
+ | 0.0559 | 8.0 | 440 | 3.3750 |
58
+ | 0.0424 | 9.0 | 495 | 3.4014 |
 
 
 
 
 
 
 
 
 
 
59
 
60
 
61
  ### Framework versions
adapter_config.json CHANGED
@@ -19,13 +19,13 @@
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
22
- "k_proj",
 
23
  "up_proj",
24
  "q_proj",
 
25
  "down_proj",
26
- "v_proj",
27
- "o_proj",
28
- "gate_proj"
29
  ],
30
  "task_type": "CAUSAL_LM"
31
  }
 
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
22
+ "o_proj",
23
+ "gate_proj",
24
  "up_proj",
25
  "q_proj",
26
+ "k_proj",
27
  "down_proj",
28
+ "v_proj"
 
 
29
  ],
30
  "task_type": "CAUSAL_LM"
31
  }
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_loss": 1.987522840499878,
4
- "eval_runtime": 6.7242,
5
  "eval_samples": 23,
6
- "eval_samples_per_second": 3.42,
7
- "eval_steps_per_second": 0.892,
8
  "train_loss": 0.0,
9
- "train_runtime": 12.2134,
10
  "train_samples": 217,
11
- "train_samples_per_second": 355.347,
12
- "train_steps_per_second": 90.065
13
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_loss": 3.40458345413208,
4
+ "eval_runtime": 7.4156,
5
  "eval_samples": 23,
6
+ "eval_samples_per_second": 3.102,
7
+ "eval_steps_per_second": 0.809,
8
  "train_loss": 0.0,
9
+ "train_runtime": 16.9739,
10
  "train_samples": 217,
11
+ "train_samples_per_second": 127.843,
12
+ "train_steps_per_second": 32.403
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_loss": 1.987522840499878,
4
- "eval_runtime": 6.7242,
5
  "eval_samples": 23,
6
- "eval_samples_per_second": 3.42,
7
- "eval_steps_per_second": 0.892
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_loss": 3.40458345413208,
4
+ "eval_runtime": 7.4156,
5
  "eval_samples": 23,
6
+ "eval_samples_per_second": 3.102,
7
+ "eval_steps_per_second": 0.809
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
  "train_loss": 0.0,
4
- "train_runtime": 12.2134,
5
  "train_samples": 217,
6
- "train_samples_per_second": 355.347,
7
- "train_steps_per_second": 90.065
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
  "train_loss": 0.0,
4
+ "train_runtime": 16.9739,
5
  "train_samples": 217,
6
+ "train_samples_per_second": 127.843,
7
+ "train_steps_per_second": 32.403
8
  }
trainer_state.json CHANGED
@@ -1,1973 +1,965 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 20.0,
5
  "eval_steps": 500,
6
- "global_step": 1100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.02,
13
- "learning_rate": 9.090909090909091e-08,
14
  "loss": 2.3833,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.07,
19
- "learning_rate": 3.6363636363636366e-07,
20
- "loss": 2.4789,
21
  "step": 4
22
  },
23
  {
24
  "epoch": 0.15,
25
- "learning_rate": 7.272727272727273e-07,
26
- "loss": 2.3195,
27
  "step": 8
28
  },
29
  {
30
  "epoch": 0.22,
31
- "learning_rate": 1.090909090909091e-06,
32
- "loss": 2.3366,
33
  "step": 12
34
  },
35
  {
36
  "epoch": 0.29,
37
- "learning_rate": 1.4545454545454546e-06,
38
- "loss": 2.3221,
39
  "step": 16
40
  },
41
  {
42
  "epoch": 0.36,
43
- "learning_rate": 1.8181818181818183e-06,
44
- "loss": 2.4036,
45
  "step": 20
46
  },
47
  {
48
  "epoch": 0.44,
49
- "learning_rate": 2.181818181818182e-06,
50
- "loss": 2.4224,
51
  "step": 24
52
  },
53
  {
54
  "epoch": 0.51,
55
- "learning_rate": 2.5454545454545456e-06,
56
- "loss": 2.6085,
57
  "step": 28
58
  },
59
  {
60
  "epoch": 0.58,
61
- "learning_rate": 2.9090909090909093e-06,
62
- "loss": 2.5477,
63
  "step": 32
64
  },
65
  {
66
  "epoch": 0.65,
67
- "learning_rate": 3.272727272727273e-06,
68
- "loss": 2.4446,
69
  "step": 36
70
  },
71
  {
72
  "epoch": 0.73,
73
- "learning_rate": 3.6363636363636366e-06,
74
- "loss": 2.3109,
75
  "step": 40
76
  },
77
  {
78
  "epoch": 0.8,
79
- "learning_rate": 4.000000000000001e-06,
80
- "loss": 2.4149,
81
  "step": 44
82
  },
83
  {
84
  "epoch": 0.87,
85
- "learning_rate": 4.363636363636364e-06,
86
- "loss": 2.5514,
87
  "step": 48
88
  },
89
  {
90
  "epoch": 0.95,
91
- "learning_rate": 4.727272727272728e-06,
92
- "loss": 2.3816,
93
  "step": 52
94
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  {
96
  "epoch": 1.02,
97
- "learning_rate": 5.090909090909091e-06,
98
- "loss": 2.6293,
99
  "step": 56
100
  },
101
  {
102
  "epoch": 1.09,
103
- "learning_rate": 5.4545454545454545e-06,
104
- "loss": 2.2422,
105
  "step": 60
106
  },
107
  {
108
  "epoch": 1.16,
109
- "learning_rate": 5.8181818181818185e-06,
110
- "loss": 2.4031,
111
  "step": 64
112
  },
113
  {
114
  "epoch": 1.24,
115
- "learning_rate": 6.181818181818182e-06,
116
- "loss": 2.2303,
117
  "step": 68
118
  },
119
  {
120
  "epoch": 1.31,
121
- "learning_rate": 6.545454545454546e-06,
122
- "loss": 2.2847,
123
  "step": 72
124
  },
125
  {
126
  "epoch": 1.38,
127
- "learning_rate": 6.90909090909091e-06,
128
- "loss": 2.1578,
129
  "step": 76
130
  },
131
  {
132
  "epoch": 1.45,
133
- "learning_rate": 7.272727272727273e-06,
134
- "loss": 2.1774,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 1.53,
139
- "learning_rate": 7.636363636363638e-06,
140
- "loss": 2.197,
141
  "step": 84
142
  },
143
  {
144
  "epoch": 1.6,
145
- "learning_rate": 8.000000000000001e-06,
146
- "loss": 2.2093,
147
  "step": 88
148
  },
149
  {
150
  "epoch": 1.67,
151
- "learning_rate": 8.363636363636365e-06,
152
- "loss": 2.1004,
153
  "step": 92
154
  },
155
  {
156
  "epoch": 1.75,
157
- "learning_rate": 8.727272727272728e-06,
158
- "loss": 2.0526,
159
  "step": 96
160
  },
161
  {
162
  "epoch": 1.82,
163
- "learning_rate": 9.090909090909091e-06,
164
- "loss": 2.0771,
165
  "step": 100
166
  },
167
  {
168
  "epoch": 1.89,
169
- "learning_rate": 9.454545454545456e-06,
170
- "loss": 2.0219,
171
  "step": 104
172
  },
173
  {
174
  "epoch": 1.96,
175
- "learning_rate": 9.81818181818182e-06,
176
- "loss": 1.9439,
177
  "step": 108
178
  },
179
  {
180
  "epoch": 2.04,
181
- "learning_rate": 9.999899300364534e-06,
182
- "loss": 1.9175,
183
  "step": 112
184
  },
185
  {
186
  "epoch": 2.11,
187
- "learning_rate": 9.99909372761763e-06,
188
- "loss": 1.8281,
189
  "step": 116
190
  },
191
  {
192
  "epoch": 2.18,
193
- "learning_rate": 9.997482711915926e-06,
194
- "loss": 1.9423,
195
  "step": 120
196
  },
197
  {
198
  "epoch": 2.25,
199
- "learning_rate": 9.99506651282272e-06,
200
- "loss": 1.9385,
201
  "step": 124
202
  },
203
  {
204
  "epoch": 2.33,
205
- "learning_rate": 9.991845519630679e-06,
206
- "loss": 2.0381,
207
  "step": 128
208
  },
209
  {
210
  "epoch": 2.4,
211
- "learning_rate": 9.987820251299121e-06,
212
- "loss": 2.0296,
213
  "step": 132
214
  },
215
  {
216
  "epoch": 2.47,
217
- "learning_rate": 9.982991356370404e-06,
218
- "loss": 1.9167,
219
  "step": 136
220
  },
221
  {
222
  "epoch": 2.55,
223
- "learning_rate": 9.977359612865424e-06,
224
- "loss": 1.9132,
225
  "step": 140
226
  },
227
  {
228
  "epoch": 2.62,
229
- "learning_rate": 9.970925928158275e-06,
230
- "loss": 1.8455,
231
  "step": 144
232
  },
233
  {
234
  "epoch": 2.69,
235
- "learning_rate": 9.963691338830045e-06,
236
- "loss": 1.8692,
237
  "step": 148
238
  },
239
  {
240
  "epoch": 2.76,
241
- "learning_rate": 9.955657010501807e-06,
242
- "loss": 1.8812,
243
  "step": 152
244
  },
245
  {
246
  "epoch": 2.84,
247
- "learning_rate": 9.946824237646823e-06,
248
- "loss": 1.9785,
249
  "step": 156
250
  },
251
  {
252
  "epoch": 2.91,
253
- "learning_rate": 9.937194443381972e-06,
254
- "loss": 1.9511,
255
  "step": 160
256
  },
257
  {
258
  "epoch": 2.98,
259
- "learning_rate": 9.926769179238467e-06,
260
- "loss": 1.8505,
261
  "step": 164
262
  },
263
  {
264
  "epoch": 3.05,
265
- "learning_rate": 9.915550124911866e-06,
266
- "loss": 1.9195,
267
  "step": 168
268
  },
269
  {
270
  "epoch": 3.13,
271
- "learning_rate": 9.903539087991462e-06,
272
- "loss": 1.8527,
273
  "step": 172
274
  },
275
  {
276
  "epoch": 3.2,
277
- "learning_rate": 9.890738003669029e-06,
278
- "loss": 1.7579,
279
  "step": 176
280
  },
281
  {
282
  "epoch": 3.27,
283
- "learning_rate": 9.877148934427037e-06,
284
- "loss": 1.9244,
285
  "step": 180
286
  },
287
  {
288
  "epoch": 3.35,
289
- "learning_rate": 9.862774069706346e-06,
290
- "loss": 1.8144,
291
  "step": 184
292
  },
293
  {
294
  "epoch": 3.42,
295
- "learning_rate": 9.847615725553457e-06,
296
- "loss": 1.8462,
297
  "step": 188
298
  },
299
  {
300
  "epoch": 3.49,
301
- "learning_rate": 9.831676344247343e-06,
302
- "loss": 1.7888,
303
  "step": 192
304
  },
305
  {
306
  "epoch": 3.56,
307
- "learning_rate": 9.814958493905962e-06,
308
- "loss": 1.9216,
309
  "step": 196
310
  },
311
  {
312
  "epoch": 3.64,
313
- "learning_rate": 9.797464868072489e-06,
314
- "loss": 1.7411,
315
  "step": 200
316
  },
317
  {
318
  "epoch": 3.71,
319
- "learning_rate": 9.779198285281326e-06,
320
- "loss": 1.7989,
321
  "step": 204
322
  },
323
  {
324
  "epoch": 3.78,
325
- "learning_rate": 9.760161688604008e-06,
326
- "loss": 1.8476,
327
  "step": 208
328
  },
329
  {
330
  "epoch": 3.85,
331
- "learning_rate": 9.740358145174999e-06,
332
- "loss": 1.7767,
333
  "step": 212
334
  },
335
  {
336
  "epoch": 3.93,
337
- "learning_rate": 9.719790845697534e-06,
338
- "loss": 1.7036,
339
  "step": 216
340
  },
341
  {
342
  "epoch": 4.0,
343
- "learning_rate": 9.698463103929542e-06,
344
- "loss": 1.8114,
345
- "step": 220
346
- },
347
- {
348
- "epoch": 4.0,
349
- "gpt4_scores": 0.65,
350
- "step": 220
351
- },
352
- {
353
- "epoch": 4.0,
354
- "std": 0.11335784048754634,
355
- "step": 220
356
- },
357
- {
358
- "epoch": 4.0,
359
- "eval_loss": 1.8505451679229736,
360
- "eval_runtime": 4.9659,
361
- "eval_samples_per_second": 4.632,
362
- "eval_steps_per_second": 1.208,
363
  "step": 220
364
  },
365
  {
366
  "epoch": 4.07,
367
- "learning_rate": 9.676378356149733e-06,
368
- "loss": 1.7635,
369
  "step": 224
370
  },
371
  {
372
  "epoch": 4.15,
373
- "learning_rate": 9.653540160603956e-06,
374
- "loss": 1.5819,
375
  "step": 228
376
  },
377
  {
378
  "epoch": 4.22,
379
- "learning_rate": 9.629952196931902e-06,
380
- "loss": 1.8023,
381
  "step": 232
382
  },
383
  {
384
  "epoch": 4.29,
385
- "learning_rate": 9.60561826557425e-06,
386
- "loss": 1.7447,
387
  "step": 236
388
  },
389
  {
390
  "epoch": 4.36,
391
- "learning_rate": 9.580542287160348e-06,
392
- "loss": 1.7547,
393
  "step": 240
394
  },
395
  {
396
  "epoch": 4.44,
397
- "learning_rate": 9.554728301876525e-06,
398
- "loss": 1.8657,
399
  "step": 244
400
  },
401
  {
402
  "epoch": 4.51,
403
- "learning_rate": 9.528180468815155e-06,
404
- "loss": 1.7641,
405
  "step": 248
406
  },
407
  {
408
  "epoch": 4.58,
409
- "learning_rate": 9.50090306530454e-06,
410
- "loss": 1.7088,
411
  "step": 252
412
  },
413
  {
414
  "epoch": 4.65,
415
- "learning_rate": 9.47290048621977e-06,
416
- "loss": 1.8404,
417
  "step": 256
418
  },
419
  {
420
  "epoch": 4.73,
421
- "learning_rate": 9.444177243274619e-06,
422
- "loss": 1.7781,
423
  "step": 260
424
  },
425
  {
426
  "epoch": 4.8,
427
- "learning_rate": 9.414737964294636e-06,
428
- "loss": 1.804,
429
  "step": 264
430
  },
431
  {
432
  "epoch": 4.87,
433
- "learning_rate": 9.384587392471516e-06,
434
- "loss": 1.8061,
435
  "step": 268
436
  },
437
  {
438
  "epoch": 4.95,
439
- "learning_rate": 9.353730385598887e-06,
440
- "loss": 1.6723,
441
  "step": 272
442
  },
443
  {
444
  "epoch": 5.0,
445
- "gpt4_scores": 0.73,
446
  "step": 275
447
  },
448
  {
449
  "epoch": 5.0,
450
- "std": 0.10959014554237986,
451
  "step": 275
452
  },
453
  {
454
  "epoch": 5.0,
455
- "eval_loss": 1.8371697664260864,
456
- "eval_runtime": 4.9576,
457
- "eval_samples_per_second": 4.639,
458
- "eval_steps_per_second": 1.21,
459
  "step": 275
460
  },
461
  {
462
  "epoch": 5.02,
463
- "learning_rate": 9.322171915289635e-06,
464
- "loss": 1.7486,
465
  "step": 276
466
  },
467
  {
468
  "epoch": 5.09,
469
- "learning_rate": 9.289917066174887e-06,
470
- "loss": 1.7675,
471
  "step": 280
472
  },
473
  {
474
  "epoch": 5.16,
475
- "learning_rate": 9.256971035084786e-06,
476
- "loss": 1.7188,
477
  "step": 284
478
  },
479
  {
480
  "epoch": 5.24,
481
- "learning_rate": 9.223339130211194e-06,
482
- "loss": 1.7628,
483
  "step": 288
484
  },
485
  {
486
  "epoch": 5.31,
487
- "learning_rate": 9.189026770252437e-06,
488
- "loss": 1.6931,
489
  "step": 292
490
  },
491
  {
492
  "epoch": 5.38,
493
- "learning_rate": 9.154039483540273e-06,
494
- "loss": 1.5731,
495
  "step": 296
496
  },
497
  {
498
  "epoch": 5.45,
499
- "learning_rate": 9.118382907149164e-06,
500
- "loss": 1.7763,
501
  "step": 300
502
  },
503
  {
504
  "epoch": 5.53,
505
- "learning_rate": 9.08206278598805e-06,
506
- "loss": 1.7601,
507
  "step": 304
508
  },
509
  {
510
  "epoch": 5.6,
511
- "learning_rate": 9.045084971874738e-06,
512
- "loss": 1.6893,
513
  "step": 308
514
  },
515
  {
516
  "epoch": 5.67,
517
- "learning_rate": 9.007455422593077e-06,
518
- "loss": 1.7848,
519
  "step": 312
520
  },
521
  {
522
  "epoch": 5.75,
523
- "learning_rate": 8.969180200933048e-06,
524
- "loss": 1.7517,
525
  "step": 316
526
  },
527
  {
528
  "epoch": 5.82,
529
- "learning_rate": 8.930265473713939e-06,
530
- "loss": 1.6602,
531
  "step": 320
532
  },
533
  {
534
  "epoch": 5.89,
535
- "learning_rate": 8.890717510790763e-06,
536
- "loss": 1.6003,
537
  "step": 324
538
  },
539
  {
540
  "epoch": 5.96,
541
- "learning_rate": 8.850542684044078e-06,
542
- "loss": 1.6532,
543
  "step": 328
544
  },
545
  {
546
  "epoch": 6.0,
547
- "gpt4_scores": 0.65,
548
  "step": 330
549
  },
550
  {
551
  "epoch": 6.0,
552
- "std": 0.1298075498574717,
553
  "step": 330
554
  },
555
  {
556
  "epoch": 6.0,
557
- "eval_loss": 1.8295962810516357,
558
- "eval_runtime": 4.9535,
559
- "eval_samples_per_second": 4.643,
560
- "eval_steps_per_second": 1.211,
561
  "step": 330
562
  },
563
  {
564
  "epoch": 6.04,
565
- "learning_rate": 8.809747466353356e-06,
566
- "loss": 1.597,
567
  "step": 332
568
  },
569
  {
570
  "epoch": 6.11,
571
- "learning_rate": 8.768338430554083e-06,
572
- "loss": 1.7247,
573
  "step": 336
574
  },
575
  {
576
  "epoch": 6.18,
577
- "learning_rate": 8.726322248378775e-06,
578
- "loss": 1.7433,
579
  "step": 340
580
  },
581
  {
582
  "epoch": 6.25,
583
- "learning_rate": 8.683705689382025e-06,
584
- "loss": 1.6082,
585
  "step": 344
586
  },
587
  {
588
  "epoch": 6.33,
589
- "learning_rate": 8.640495619849821e-06,
590
- "loss": 1.6855,
591
  "step": 348
592
  },
593
  {
594
  "epoch": 6.4,
595
- "learning_rate": 8.596699001693257e-06,
596
- "loss": 1.5826,
597
  "step": 352
598
  },
599
  {
600
  "epoch": 6.47,
601
- "learning_rate": 8.552322891326846e-06,
602
- "loss": 1.6749,
603
  "step": 356
604
  },
605
  {
606
  "epoch": 6.55,
607
- "learning_rate": 8.507374438531606e-06,
608
- "loss": 1.649,
609
  "step": 360
610
  },
611
  {
612
  "epoch": 6.62,
613
- "learning_rate": 8.461860885303116e-06,
614
- "loss": 1.6959,
615
  "step": 364
616
  },
617
  {
618
  "epoch": 6.69,
619
- "learning_rate": 8.415789564684673e-06,
620
- "loss": 1.6805,
621
  "step": 368
622
  },
623
  {
624
  "epoch": 6.76,
625
- "learning_rate": 8.36916789958584e-06,
626
- "loss": 1.67,
627
  "step": 372
628
  },
629
  {
630
  "epoch": 6.84,
631
- "learning_rate": 8.322003401586463e-06,
632
- "loss": 1.6226,
633
  "step": 376
634
  },
635
  {
636
  "epoch": 6.91,
637
- "learning_rate": 8.274303669726427e-06,
638
- "loss": 1.6525,
639
  "step": 380
640
  },
641
  {
642
  "epoch": 6.98,
643
- "learning_rate": 8.226076389281316e-06,
644
- "loss": 1.7187,
645
  "step": 384
646
  },
647
  {
648
  "epoch": 7.0,
649
- "gpt4_scores": 0.74,
650
  "step": 385
651
  },
652
  {
653
  "epoch": 7.0,
654
- "std": 0.11153474794878948,
655
  "step": 385
656
  },
657
  {
658
  "epoch": 7.0,
659
- "eval_loss": 1.8273346424102783,
660
- "eval_runtime": 4.923,
661
- "eval_samples_per_second": 4.672,
662
- "eval_steps_per_second": 1.219,
663
  "step": 385
664
  },
665
  {
666
  "epoch": 7.05,
667
- "learning_rate": 8.177329330524182e-06,
668
- "loss": 1.6103,
669
  "step": 388
670
  },
671
  {
672
  "epoch": 7.13,
673
- "learning_rate": 8.128070347473609e-06,
674
- "loss": 1.546,
675
  "step": 392
676
  },
677
  {
678
  "epoch": 7.2,
679
- "learning_rate": 8.078307376628292e-06,
680
- "loss": 1.551,
681
  "step": 396
682
  },
683
  {
684
  "epoch": 7.27,
685
- "learning_rate": 8.028048435688333e-06,
686
- "loss": 1.6379,
687
  "step": 400
688
  },
689
  {
690
  "epoch": 7.35,
691
- "learning_rate": 7.97730162226344e-06,
692
- "loss": 1.61,
693
  "step": 404
694
  },
695
  {
696
  "epoch": 7.42,
697
- "learning_rate": 7.92607511256826e-06,
698
- "loss": 1.6151,
699
  "step": 408
700
  },
701
  {
702
  "epoch": 7.49,
703
- "learning_rate": 7.874377160105037e-06,
704
- "loss": 1.6989,
705
  "step": 412
706
  },
707
  {
708
  "epoch": 7.56,
709
- "learning_rate": 7.822216094333847e-06,
710
- "loss": 1.6711,
711
  "step": 416
712
  },
713
  {
714
  "epoch": 7.64,
715
- "learning_rate": 7.769600319330553e-06,
716
- "loss": 1.6344,
717
  "step": 420
718
  },
719
  {
720
  "epoch": 7.71,
721
- "learning_rate": 7.716538312432767e-06,
722
- "loss": 1.6,
723
  "step": 424
724
  },
725
  {
726
  "epoch": 7.78,
727
- "learning_rate": 7.663038622873999e-06,
728
- "loss": 1.6563,
729
  "step": 428
730
  },
731
  {
732
  "epoch": 7.85,
733
- "learning_rate": 7.60910987040623e-06,
734
- "loss": 1.5933,
735
  "step": 432
736
  },
737
  {
738
  "epoch": 7.93,
739
- "learning_rate": 7.554760743911104e-06,
740
- "loss": 1.6445,
741
  "step": 436
742
  },
743
  {
744
  "epoch": 8.0,
745
- "learning_rate": 7.500000000000001e-06,
746
- "loss": 1.6945,
747
  "step": 440
748
  },
749
  {
750
  "epoch": 8.0,
751
- "gpt4_scores": 0.71,
752
  "step": 440
753
  },
754
  {
755
  "epoch": 8.0,
756
- "std": 0.10904127658827183,
757
  "step": 440
758
  },
759
  {
760
  "epoch": 8.0,
761
- "eval_loss": 1.8345035314559937,
762
- "eval_runtime": 4.9576,
763
- "eval_samples_per_second": 4.639,
764
- "eval_steps_per_second": 1.21,
765
  "step": 440
766
  },
767
  {
768
  "epoch": 8.07,
769
- "learning_rate": 7.444836461603195e-06,
770
- "loss": 1.6089,
771
  "step": 444
772
  },
773
  {
774
  "epoch": 8.15,
775
- "learning_rate": 7.3892790165483164e-06,
776
- "loss": 1.6294,
777
  "step": 448
778
  },
779
  {
780
  "epoch": 8.22,
781
- "learning_rate": 7.333336616128369e-06,
782
- "loss": 1.6516,
783
  "step": 452
784
  },
785
  {
786
  "epoch": 8.29,
787
- "learning_rate": 7.2770182736595164e-06,
788
- "loss": 1.4944,
789
  "step": 456
790
  },
791
  {
792
  "epoch": 8.36,
793
- "learning_rate": 7.2203330630288714e-06,
794
- "loss": 1.5007,
795
  "step": 460
796
  },
797
  {
798
  "epoch": 8.44,
799
- "learning_rate": 7.163290117232542e-06,
800
- "loss": 1.5207,
801
  "step": 464
802
  },
803
  {
804
  "epoch": 8.51,
805
- "learning_rate": 7.105898626904134e-06,
806
- "loss": 1.6485,
807
  "step": 468
808
  },
809
  {
810
  "epoch": 8.58,
811
- "learning_rate": 7.048167838833977e-06,
812
- "loss": 1.554,
813
  "step": 472
814
  },
815
  {
816
  "epoch": 8.65,
817
- "learning_rate": 6.990107054479313e-06,
818
- "loss": 1.5661,
819
  "step": 476
820
  },
821
  {
822
  "epoch": 8.73,
823
- "learning_rate": 6.931725628465643e-06,
824
- "loss": 1.496,
825
  "step": 480
826
  },
827
  {
828
  "epoch": 8.8,
829
- "learning_rate": 6.873032967079562e-06,
830
- "loss": 1.554,
831
  "step": 484
832
  },
833
  {
834
  "epoch": 8.87,
835
- "learning_rate": 6.814038526753205e-06,
836
- "loss": 1.574,
837
  "step": 488
838
  },
839
  {
840
  "epoch": 8.95,
841
- "learning_rate": 6.75475181254068e-06,
842
- "loss": 1.5494,
843
  "step": 492
844
  },
845
  {
846
  "epoch": 9.0,
847
- "gpt4_scores": 0.5700000000000001,
848
  "step": 495
849
  },
850
  {
851
  "epoch": 9.0,
852
- "std": 0.13787675656179324,
853
  "step": 495
854
  },
855
  {
856
  "epoch": 9.0,
857
- "eval_loss": 1.8451542854309082,
858
- "eval_runtime": 4.9243,
859
- "eval_samples_per_second": 4.671,
860
- "eval_steps_per_second": 1.218,
861
  "step": 495
862
  },
863
  {
864
  "epoch": 9.02,
865
- "learning_rate": 6.695182376586603e-06,
866
- "loss": 1.6572,
867
  "step": 496
868
  },
869
  {
870
  "epoch": 9.09,
871
- "learning_rate": 6.635339816587109e-06,
872
- "loss": 1.5122,
873
  "step": 500
874
  },
875
  {
876
  "epoch": 9.16,
877
- "learning_rate": 6.5752337742434644e-06,
878
- "loss": 1.4632,
879
  "step": 504
880
  },
881
  {
882
  "epoch": 9.24,
883
- "learning_rate": 6.514873933708637e-06,
884
- "loss": 1.4757,
885
  "step": 508
886
  },
887
  {
888
  "epoch": 9.31,
889
- "learning_rate": 6.454270020026996e-06,
890
- "loss": 1.5465,
891
  "step": 512
892
  },
893
  {
894
  "epoch": 9.38,
895
- "learning_rate": 6.39343179756744e-06,
896
- "loss": 1.5713,
897
  "step": 516
898
  },
899
  {
900
  "epoch": 9.45,
901
- "learning_rate": 6.332369068450175e-06,
902
- "loss": 1.5246,
903
  "step": 520
904
  },
905
  {
906
  "epoch": 9.53,
907
- "learning_rate": 6.271091670967437e-06,
908
- "loss": 1.478,
909
  "step": 524
910
  },
911
  {
912
  "epoch": 9.6,
913
- "learning_rate": 6.209609477998339e-06,
914
- "loss": 1.5593,
915
  "step": 528
916
  },
917
  {
918
  "epoch": 9.67,
919
- "learning_rate": 6.1479323954182055e-06,
920
- "loss": 1.5553,
921
  "step": 532
922
  },
923
  {
924
  "epoch": 9.75,
925
- "learning_rate": 6.08607036050254e-06,
926
- "loss": 1.5434,
927
  "step": 536
928
  },
929
  {
930
  "epoch": 9.82,
931
- "learning_rate": 6.024033340325954e-06,
932
- "loss": 1.4686,
933
  "step": 540
934
  },
935
  {
936
  "epoch": 9.89,
937
- "learning_rate": 5.961831330156306e-06,
938
- "loss": 1.4344,
939
  "step": 544
940
  },
941
  {
942
  "epoch": 9.96,
943
- "learning_rate": 5.89947435184427e-06,
944
- "loss": 1.5329,
945
  "step": 548
946
  },
947
  {
948
  "epoch": 10.0,
949
- "gpt4_scores": 0.6,
950
- "step": 550
951
- },
952
- {
953
- "epoch": 10.0,
954
- "std": 0.14352700094407322,
955
- "step": 550
956
- },
957
- {
958
- "epoch": 10.0,
959
- "eval_loss": 1.8664953708648682,
960
- "eval_runtime": 4.9654,
961
- "eval_samples_per_second": 4.632,
962
- "eval_steps_per_second": 1.208,
963
- "step": 550
964
- },
965
- {
966
- "epoch": 10.04,
967
- "learning_rate": 5.8369724522086545e-06,
968
- "loss": 1.5753,
969
- "step": 552
970
- },
971
- {
972
- "epoch": 10.11,
973
- "learning_rate": 5.774335701417662e-06,
974
- "loss": 1.4623,
975
- "step": 556
976
- },
977
- {
978
- "epoch": 10.18,
979
- "learning_rate": 5.711574191366427e-06,
980
- "loss": 1.4908,
981
- "step": 560
982
- },
983
- {
984
- "epoch": 10.25,
985
- "learning_rate": 5.648698034051009e-06,
986
- "loss": 1.4846,
987
- "step": 564
988
- },
989
- {
990
- "epoch": 10.33,
991
- "learning_rate": 5.585717359939192e-06,
992
- "loss": 1.4591,
993
- "step": 568
994
- },
995
- {
996
- "epoch": 10.4,
997
- "learning_rate": 5.522642316338268e-06,
998
- "loss": 1.4508,
999
- "step": 572
1000
- },
1001
- {
1002
- "epoch": 10.47,
1003
- "learning_rate": 5.459483065760138e-06,
1004
- "loss": 1.4282,
1005
- "step": 576
1006
- },
1007
- {
1008
- "epoch": 10.55,
1009
- "learning_rate": 5.396249784283943e-06,
1010
- "loss": 1.4565,
1011
- "step": 580
1012
- },
1013
- {
1014
- "epoch": 10.62,
1015
- "learning_rate": 5.33295265991652e-06,
1016
- "loss": 1.5174,
1017
- "step": 584
1018
- },
1019
- {
1020
- "epoch": 10.69,
1021
- "learning_rate": 5.26960189095093e-06,
1022
- "loss": 1.4063,
1023
- "step": 588
1024
- },
1025
- {
1026
- "epoch": 10.76,
1027
- "learning_rate": 5.206207684323337e-06,
1028
- "loss": 1.4412,
1029
- "step": 592
1030
- },
1031
- {
1032
- "epoch": 10.84,
1033
- "learning_rate": 5.142780253968481e-06,
1034
- "loss": 1.5706,
1035
- "step": 596
1036
- },
1037
- {
1038
- "epoch": 10.91,
1039
- "learning_rate": 5.07932981917404e-06,
1040
- "loss": 1.4704,
1041
- "step": 600
1042
- },
1043
- {
1044
- "epoch": 10.98,
1045
- "learning_rate": 5.015866602934112e-06,
1046
- "loss": 1.4105,
1047
- "step": 604
1048
- },
1049
- {
1050
- "epoch": 11.0,
1051
- "gpt4_scores": 0.5700000000000001,
1052
- "step": 605
1053
- },
1054
- {
1055
- "epoch": 11.0,
1056
- "std": 0.12653062870309306,
1057
- "step": 605
1058
- },
1059
- {
1060
- "epoch": 11.0,
1061
- "eval_loss": 1.8876867294311523,
1062
- "eval_runtime": 4.9571,
1063
- "eval_samples_per_second": 4.64,
1064
- "eval_steps_per_second": 1.21,
1065
- "step": 605
1066
- },
1067
- {
1068
- "epoch": 11.05,
1069
- "learning_rate": 4.952400830302117e-06,
1070
- "loss": 1.3259,
1071
- "step": 608
1072
- },
1073
- {
1074
- "epoch": 11.13,
1075
- "learning_rate": 4.888942726743353e-06,
1076
- "loss": 1.371,
1077
- "step": 612
1078
- },
1079
- {
1080
- "epoch": 11.2,
1081
- "learning_rate": 4.825502516487497e-06,
1082
- "loss": 1.4825,
1083
- "step": 616
1084
- },
1085
- {
1086
- "epoch": 11.27,
1087
- "learning_rate": 4.762090420881289e-06,
1088
- "loss": 1.3511,
1089
- "step": 620
1090
- },
1091
- {
1092
- "epoch": 11.35,
1093
- "learning_rate": 4.6987166567417085e-06,
1094
- "loss": 1.4314,
1095
- "step": 624
1096
- },
1097
- {
1098
- "epoch": 11.42,
1099
- "learning_rate": 4.635391434709847e-06,
1100
- "loss": 1.4736,
1101
- "step": 628
1102
- },
1103
- {
1104
- "epoch": 11.49,
1105
- "learning_rate": 4.572124957605803e-06,
1106
- "loss": 1.4653,
1107
- "step": 632
1108
- },
1109
- {
1110
- "epoch": 11.56,
1111
- "learning_rate": 4.5089274187848144e-06,
1112
- "loss": 1.4053,
1113
- "step": 636
1114
- },
1115
- {
1116
- "epoch": 11.64,
1117
- "learning_rate": 4.445809000494945e-06,
1118
- "loss": 1.4959,
1119
- "step": 640
1120
- },
1121
- {
1122
- "epoch": 11.71,
1123
- "learning_rate": 4.382779872236527e-06,
1124
- "loss": 1.4142,
1125
- "step": 644
1126
- },
1127
- {
1128
- "epoch": 11.78,
1129
- "learning_rate": 4.319850189123681e-06,
1130
- "loss": 1.3764,
1131
- "step": 648
1132
- },
1133
- {
1134
- "epoch": 11.85,
1135
- "learning_rate": 4.257030090248142e-06,
1136
- "loss": 1.5262,
1137
- "step": 652
1138
- },
1139
- {
1140
- "epoch": 11.93,
1141
- "learning_rate": 4.194329697045681e-06,
1142
- "loss": 1.4114,
1143
- "step": 656
1144
- },
1145
- {
1146
- "epoch": 12.0,
1147
- "learning_rate": 4.131759111665349e-06,
1148
- "loss": 1.3862,
1149
- "step": 660
1150
- },
1151
- {
1152
- "epoch": 12.0,
1153
- "gpt4_scores": 0.64,
1154
- "step": 660
1155
- },
1156
- {
1157
- "epoch": 12.0,
1158
- "std": 0.13505554412907303,
1159
- "step": 660
1160
- },
1161
- {
1162
- "epoch": 12.0,
1163
- "eval_loss": 1.9065581560134888,
1164
- "eval_runtime": 4.9613,
1165
- "eval_samples_per_second": 4.636,
1166
- "eval_steps_per_second": 1.209,
1167
- "step": 660
1168
- },
1169
- {
1170
- "epoch": 12.07,
1171
- "learning_rate": 4.06932841534185e-06,
1172
- "loss": 1.4282,
1173
- "step": 664
1174
- },
1175
- {
1176
- "epoch": 12.15,
1177
- "learning_rate": 4.007047666771274e-06,
1178
- "loss": 1.3369,
1179
- "step": 668
1180
- },
1181
- {
1182
- "epoch": 12.22,
1183
- "learning_rate": 3.944926900490452e-06,
1184
- "loss": 1.4198,
1185
- "step": 672
1186
- },
1187
- {
1188
- "epoch": 12.29,
1189
- "learning_rate": 3.882976125260229e-06,
1190
- "loss": 1.3799,
1191
- "step": 676
1192
- },
1193
- {
1194
- "epoch": 12.36,
1195
- "learning_rate": 3.821205322452863e-06,
1196
- "loss": 1.4461,
1197
- "step": 680
1198
- },
1199
- {
1200
- "epoch": 12.44,
1201
- "learning_rate": 3.7596244444438577e-06,
1202
- "loss": 1.4154,
1203
- "step": 684
1204
- },
1205
- {
1206
- "epoch": 12.51,
1207
- "learning_rate": 3.69824341300844e-06,
1208
- "loss": 1.3719,
1209
- "step": 688
1210
- },
1211
- {
1212
- "epoch": 12.58,
1213
- "learning_rate": 3.637072117723012e-06,
1214
- "loss": 1.3079,
1215
- "step": 692
1216
- },
1217
- {
1218
- "epoch": 12.65,
1219
- "learning_rate": 3.5761204143717387e-06,
1220
- "loss": 1.3786,
1221
- "step": 696
1222
- },
1223
- {
1224
- "epoch": 12.73,
1225
- "learning_rate": 3.5153981233586277e-06,
1226
- "loss": 1.3406,
1227
- "step": 700
1228
- },
1229
- {
1230
- "epoch": 12.8,
1231
- "learning_rate": 3.4549150281252635e-06,
1232
- "loss": 1.3967,
1233
- "step": 704
1234
- },
1235
- {
1236
- "epoch": 12.87,
1237
- "learning_rate": 3.394680873574546e-06,
1238
- "loss": 1.4914,
1239
- "step": 708
1240
- },
1241
- {
1242
- "epoch": 12.95,
1243
- "learning_rate": 3.3347053645005965e-06,
1244
- "loss": 1.4126,
1245
- "step": 712
1246
- },
1247
- {
1248
- "epoch": 13.0,
1249
- "gpt4_scores": 0.53,
1250
- "step": 715
1251
- },
1252
- {
1253
- "epoch": 13.0,
1254
- "std": 0.14356183336806477,
1255
- "step": 715
1256
- },
1257
- {
1258
- "epoch": 13.0,
1259
- "eval_loss": 1.9303113222122192,
1260
- "eval_runtime": 4.9701,
1261
- "eval_samples_per_second": 4.628,
1262
- "eval_steps_per_second": 1.207,
1263
- "step": 715
1264
- },
1265
- {
1266
- "epoch": 13.02,
1267
- "learning_rate": 3.274998164025148e-06,
1268
- "loss": 1.4012,
1269
- "step": 716
1270
- },
1271
- {
1272
- "epoch": 13.09,
1273
- "learning_rate": 3.2155688920406415e-06,
1274
- "loss": 1.4258,
1275
- "step": 720
1276
- },
1277
- {
1278
- "epoch": 13.16,
1279
- "learning_rate": 3.156427123660297e-06,
1280
- "loss": 1.3023,
1281
- "step": 724
1282
- },
1283
- {
1284
- "epoch": 13.24,
1285
- "learning_rate": 3.097582387675385e-06,
1286
- "loss": 1.3797,
1287
- "step": 728
1288
- },
1289
- {
1290
- "epoch": 13.31,
1291
- "learning_rate": 3.0390441650199727e-06,
1292
- "loss": 1.335,
1293
- "step": 732
1294
- },
1295
- {
1296
- "epoch": 13.38,
1297
- "learning_rate": 2.980821887243377e-06,
1298
- "loss": 1.388,
1299
- "step": 736
1300
- },
1301
- {
1302
- "epoch": 13.45,
1303
- "learning_rate": 2.9229249349905686e-06,
1304
- "loss": 1.2886,
1305
- "step": 740
1306
- },
1307
- {
1308
- "epoch": 13.53,
1309
- "learning_rate": 2.8653626364907918e-06,
1310
- "loss": 1.3144,
1311
- "step": 744
1312
- },
1313
- {
1314
- "epoch": 13.6,
1315
- "learning_rate": 2.8081442660546126e-06,
1316
- "loss": 1.3285,
1317
- "step": 748
1318
- },
1319
- {
1320
- "epoch": 13.67,
1321
- "learning_rate": 2.751279042579672e-06,
1322
- "loss": 1.4061,
1323
- "step": 752
1324
- },
1325
- {
1326
- "epoch": 13.75,
1327
- "learning_rate": 2.694776128065345e-06,
1328
- "loss": 1.3877,
1329
- "step": 756
1330
- },
1331
- {
1332
- "epoch": 13.82,
1333
- "learning_rate": 2.6386446261365874e-06,
1334
- "loss": 1.2318,
1335
- "step": 760
1336
- },
1337
- {
1338
- "epoch": 13.89,
1339
- "learning_rate": 2.5828935805771804e-06,
1340
- "loss": 1.3826,
1341
- "step": 764
1342
- },
1343
- {
1344
- "epoch": 13.96,
1345
- "learning_rate": 2.527531973872617e-06,
1346
- "loss": 1.388,
1347
- "step": 768
1348
- },
1349
- {
1350
- "epoch": 14.0,
1351
- "gpt4_scores": 0.62,
1352
- "step": 770
1353
- },
1354
- {
1355
- "epoch": 14.0,
1356
- "std": 0.1362350909274112,
1357
- "step": 770
1358
- },
1359
- {
1360
- "epoch": 14.0,
1361
- "eval_loss": 1.9448539018630981,
1362
- "eval_runtime": 4.9425,
1363
- "eval_samples_per_second": 4.654,
1364
- "eval_steps_per_second": 1.214,
1365
- "step": 770
1366
- },
1367
- {
1368
- "epoch": 14.04,
1369
- "learning_rate": 2.4725687257628533e-06,
1370
- "loss": 1.524,
1371
- "step": 772
1372
- },
1373
- {
1374
- "epoch": 14.11,
1375
- "learning_rate": 2.418012691805191e-06,
1376
- "loss": 1.3961,
1377
- "step": 776
1378
- },
1379
- {
1380
- "epoch": 14.18,
1381
- "learning_rate": 2.363872661947488e-06,
1382
- "loss": 1.3303,
1383
- "step": 780
1384
- },
1385
- {
1386
- "epoch": 14.25,
1387
- "learning_rate": 2.310157359111938e-06,
1388
- "loss": 1.3642,
1389
- "step": 784
1390
- },
1391
- {
1392
- "epoch": 14.33,
1393
- "learning_rate": 2.2568754377896516e-06,
1394
- "loss": 1.3532,
1395
- "step": 788
1396
- },
1397
- {
1398
- "epoch": 14.4,
1399
- "learning_rate": 2.204035482646267e-06,
1400
- "loss": 1.2528,
1401
- "step": 792
1402
- },
1403
- {
1404
- "epoch": 14.47,
1405
- "learning_rate": 2.1516460071388062e-06,
1406
- "loss": 1.3888,
1407
- "step": 796
1408
- },
1409
- {
1410
- "epoch": 14.55,
1411
- "learning_rate": 2.09971545214401e-06,
1412
- "loss": 1.269,
1413
- "step": 800
1414
- },
1415
- {
1416
- "epoch": 14.62,
1417
- "learning_rate": 2.0482521845983522e-06,
1418
- "loss": 1.3103,
1419
- "step": 804
1420
- },
1421
- {
1422
- "epoch": 14.69,
1423
- "learning_rate": 1.9972644961499853e-06,
1424
- "loss": 1.3151,
1425
- "step": 808
1426
- },
1427
- {
1428
- "epoch": 14.76,
1429
- "learning_rate": 1.946760601822809e-06,
1430
- "loss": 1.3055,
1431
- "step": 812
1432
- },
1433
- {
1434
- "epoch": 14.84,
1435
- "learning_rate": 1.8967486386928819e-06,
1436
- "loss": 1.3414,
1437
- "step": 816
1438
- },
1439
- {
1440
- "epoch": 14.91,
1441
- "learning_rate": 1.8472366645773892e-06,
1442
- "loss": 1.3481,
1443
- "step": 820
1444
- },
1445
- {
1446
- "epoch": 14.98,
1447
- "learning_rate": 1.798232656736389e-06,
1448
- "loss": 1.3653,
1449
- "step": 824
1450
- },
1451
- {
1452
- "epoch": 15.0,
1453
- "gpt4_scores": 0.605,
1454
- "step": 825
1455
- },
1456
- {
1457
- "epoch": 15.0,
1458
- "std": 0.13312588027877975,
1459
- "step": 825
1460
- },
1461
- {
1462
- "epoch": 15.0,
1463
- "eval_loss": 1.9636751413345337,
1464
- "eval_runtime": 4.9484,
1465
- "eval_samples_per_second": 4.648,
1466
- "eval_steps_per_second": 1.213,
1467
- "step": 825
1468
- },
1469
- {
1470
- "epoch": 15.05,
1471
- "learning_rate": 1.7497445105875377e-06,
1472
- "loss": 1.4389,
1473
- "step": 828
1474
- },
1475
- {
1476
- "epoch": 15.13,
1477
- "learning_rate": 1.7017800384339928e-06,
1478
- "loss": 1.3568,
1479
- "step": 832
1480
- },
1481
- {
1482
- "epoch": 15.2,
1483
- "learning_rate": 1.6543469682057105e-06,
1484
- "loss": 1.3535,
1485
- "step": 836
1486
- },
1487
- {
1488
- "epoch": 15.27,
1489
- "learning_rate": 1.6074529422143398e-06,
1490
- "loss": 1.2696,
1491
- "step": 840
1492
- },
1493
- {
1494
- "epoch": 15.35,
1495
- "learning_rate": 1.561105515921915e-06,
1496
- "loss": 1.2854,
1497
- "step": 844
1498
- },
1499
- {
1500
- "epoch": 15.42,
1501
- "learning_rate": 1.5153121567235334e-06,
1502
- "loss": 1.2779,
1503
- "step": 848
1504
- },
1505
- {
1506
- "epoch": 15.49,
1507
- "learning_rate": 1.470080242744218e-06,
1508
- "loss": 1.3024,
1509
- "step": 852
1510
- },
1511
- {
1512
- "epoch": 15.56,
1513
- "learning_rate": 1.4254170616501828e-06,
1514
- "loss": 1.3081,
1515
- "step": 856
1516
- },
1517
- {
1518
- "epoch": 15.64,
1519
- "learning_rate": 1.3813298094746491e-06,
1520
- "loss": 1.2898,
1521
- "step": 860
1522
- },
1523
- {
1524
- "epoch": 15.71,
1525
- "learning_rate": 1.3378255894584463e-06,
1526
- "loss": 1.2944,
1527
- "step": 864
1528
- },
1529
- {
1530
- "epoch": 15.78,
1531
- "learning_rate": 1.2949114109055417e-06,
1532
- "loss": 1.3638,
1533
- "step": 868
1534
- },
1535
- {
1536
- "epoch": 15.85,
1537
- "learning_rate": 1.2525941880537307e-06,
1538
- "loss": 1.3272,
1539
- "step": 872
1540
- },
1541
- {
1542
- "epoch": 15.93,
1543
- "learning_rate": 1.210880738960616e-06,
1544
- "loss": 1.2646,
1545
- "step": 876
1546
- },
1547
- {
1548
- "epoch": 16.0,
1549
- "learning_rate": 1.1697777844051105e-06,
1550
- "loss": 1.361,
1551
- "step": 880
1552
- },
1553
- {
1554
- "epoch": 16.0,
1555
- "gpt4_scores": 0.5999999999999999,
1556
- "step": 880
1557
- },
1558
- {
1559
- "epoch": 16.0,
1560
- "std": 0.1319090595827292,
1561
- "step": 880
1562
- },
1563
- {
1564
- "epoch": 16.0,
1565
- "eval_loss": 1.9738112688064575,
1566
- "eval_runtime": 4.9552,
1567
- "eval_samples_per_second": 4.642,
1568
- "eval_steps_per_second": 1.211,
1569
- "step": 880
1570
- },
1571
- {
1572
- "epoch": 16.07,
1573
- "learning_rate": 1.1292919468045876e-06,
1574
- "loss": 1.2656,
1575
- "step": 884
1576
- },
1577
- {
1578
- "epoch": 16.15,
1579
- "learning_rate": 1.0894297491479044e-06,
1580
- "loss": 1.376,
1581
- "step": 888
1582
- },
1583
- {
1584
- "epoch": 16.22,
1585
- "learning_rate": 1.0501976139444191e-06,
1586
- "loss": 1.2771,
1587
- "step": 892
1588
- },
1589
- {
1590
- "epoch": 16.29,
1591
- "learning_rate": 1.0116018621892237e-06,
1592
- "loss": 1.2919,
1593
- "step": 896
1594
- },
1595
- {
1596
- "epoch": 16.36,
1597
- "learning_rate": 9.73648712344707e-07,
1598
- "loss": 1.3542,
1599
- "step": 900
1600
- },
1601
- {
1602
- "epoch": 16.44,
1603
- "learning_rate": 9.363442793386606e-07,
1604
- "loss": 1.275,
1605
- "step": 904
1606
- },
1607
- {
1608
- "epoch": 16.51,
1609
- "learning_rate": 8.996945735790447e-07,
1610
- "loss": 1.3197,
1611
- "step": 908
1612
- },
1613
- {
1614
- "epoch": 16.58,
1615
- "learning_rate": 8.637054999856148e-07,
1616
- "loss": 1.3272,
1617
- "step": 912
1618
- },
1619
- {
1620
- "epoch": 16.65,
1621
- "learning_rate": 8.283828570385239e-07,
1622
- "loss": 1.2585,
1623
- "step": 916
1624
- },
1625
- {
1626
- "epoch": 16.73,
1627
- "learning_rate": 7.937323358440935e-07,
1628
- "loss": 1.2851,
1629
- "step": 920
1630
- },
1631
- {
1632
- "epoch": 16.8,
1633
- "learning_rate": 7.597595192178702e-07,
1634
- "loss": 1.2787,
1635
- "step": 924
1636
- },
1637
- {
1638
- "epoch": 16.87,
1639
- "learning_rate": 7.264698807851328e-07,
1640
- "loss": 1.3594,
1641
- "step": 928
1642
- },
1643
- {
1644
- "epoch": 16.95,
1645
- "learning_rate": 6.938687840989972e-07,
1646
- "loss": 1.2944,
1647
- "step": 932
1648
- },
1649
- {
1650
- "epoch": 17.0,
1651
- "gpt4_scores": 0.6100000000000001,
1652
- "step": 935
1653
- },
1654
- {
1655
- "epoch": 17.0,
1656
- "std": 0.1352405264704334,
1657
- "step": 935
1658
- },
1659
- {
1660
- "epoch": 17.0,
1661
- "eval_loss": 1.9819345474243164,
1662
- "eval_runtime": 4.9634,
1663
- "eval_samples_per_second": 4.634,
1664
- "eval_steps_per_second": 1.209,
1665
- "step": 935
1666
- },
1667
- {
1668
- "epoch": 17.02,
1669
- "learning_rate": 6.619614817762537e-07,
1670
- "loss": 1.3019,
1671
- "step": 936
1672
- },
1673
- {
1674
- "epoch": 17.09,
1675
- "learning_rate": 6.307531146510754e-07,
1676
- "loss": 1.2263,
1677
- "step": 940
1678
- },
1679
- {
1680
- "epoch": 17.16,
1681
- "learning_rate": 6.002487109467347e-07,
1682
- "loss": 1.2774,
1683
- "step": 944
1684
- },
1685
- {
1686
- "epoch": 17.24,
1687
- "learning_rate": 5.704531854654721e-07,
1688
- "loss": 1.3117,
1689
- "step": 948
1690
- },
1691
- {
1692
- "epoch": 17.31,
1693
- "learning_rate": 5.413713387966329e-07,
1694
- "loss": 1.3669,
1695
- "step": 952
1696
- },
1697
- {
1698
- "epoch": 17.38,
1699
- "learning_rate": 5.130078565432089e-07,
1700
- "loss": 1.3741,
1701
- "step": 956
1702
- },
1703
- {
1704
- "epoch": 17.45,
1705
- "learning_rate": 4.853673085668947e-07,
1706
- "loss": 1.259,
1707
- "step": 960
1708
- },
1709
- {
1710
- "epoch": 17.53,
1711
- "learning_rate": 4.58454148251814e-07,
1712
- "loss": 1.2952,
1713
- "step": 964
1714
- },
1715
- {
1716
- "epoch": 17.6,
1717
- "learning_rate": 4.322727117869951e-07,
1718
- "loss": 1.2957,
1719
- "step": 968
1720
- },
1721
- {
1722
- "epoch": 17.67,
1723
- "learning_rate": 4.0682721746773346e-07,
1724
- "loss": 1.2796,
1725
- "step": 972
1726
- },
1727
- {
1728
- "epoch": 17.75,
1729
- "learning_rate": 3.821217650159453e-07,
1730
- "loss": 1.3251,
1731
- "step": 976
1732
- },
1733
- {
1734
- "epoch": 17.82,
1735
- "learning_rate": 3.581603349196372e-07,
1736
- "loss": 1.2795,
1737
- "step": 980
1738
- },
1739
- {
1740
- "epoch": 17.89,
1741
- "learning_rate": 3.3494678779157464e-07,
1742
- "loss": 1.2337,
1743
- "step": 984
1744
- },
1745
- {
1746
- "epoch": 17.96,
1747
- "learning_rate": 3.1248486374726884e-07,
1748
- "loss": 1.3433,
1749
- "step": 988
1750
- },
1751
- {
1752
- "epoch": 18.0,
1753
- "gpt4_scores": 0.5599999999999999,
1754
- "step": 990
1755
- },
1756
- {
1757
- "epoch": 18.0,
1758
- "std": 0.14573949361789343,
1759
- "step": 990
1760
- },
1761
- {
1762
- "epoch": 18.0,
1763
- "eval_loss": 1.9856066703796387,
1764
- "eval_runtime": 4.9567,
1765
- "eval_samples_per_second": 4.64,
1766
- "eval_steps_per_second": 1.21,
1767
- "step": 990
1768
- },
1769
- {
1770
- "epoch": 18.04,
1771
- "learning_rate": 2.9077818180237693e-07,
1772
- "loss": 1.2889,
1773
- "step": 992
1774
- },
1775
- {
1776
- "epoch": 18.11,
1777
- "learning_rate": 2.6983023928961406e-07,
1778
- "loss": 1.3571,
1779
- "step": 996
1780
- },
1781
- {
1782
- "epoch": 18.18,
1783
- "learning_rate": 2.4964441129527337e-07,
1784
- "loss": 1.2855,
1785
- "step": 1000
1786
- },
1787
- {
1788
- "epoch": 18.25,
1789
- "learning_rate": 2.3022395011543687e-07,
1790
- "loss": 1.2844,
1791
- "step": 1004
1792
- },
1793
- {
1794
- "epoch": 18.33,
1795
- "learning_rate": 2.1157198473197417e-07,
1796
- "loss": 1.3131,
1797
- "step": 1008
1798
- },
1799
- {
1800
- "epoch": 18.4,
1801
- "learning_rate": 1.9369152030840553e-07,
1802
- "loss": 1.2825,
1803
- "step": 1012
1804
- },
1805
- {
1806
- "epoch": 18.47,
1807
- "learning_rate": 1.765854377057219e-07,
1808
- "loss": 1.2378,
1809
- "step": 1016
1810
- },
1811
- {
1812
- "epoch": 18.55,
1813
- "learning_rate": 1.6025649301821877e-07,
1814
- "loss": 1.3067,
1815
- "step": 1020
1816
- },
1817
- {
1818
- "epoch": 18.62,
1819
- "learning_rate": 1.4470731712944885e-07,
1820
- "loss": 1.367,
1821
- "step": 1024
1822
- },
1823
- {
1824
- "epoch": 18.69,
1825
- "learning_rate": 1.2994041528833267e-07,
1826
- "loss": 1.2553,
1827
- "step": 1028
1828
- },
1829
- {
1830
- "epoch": 18.76,
1831
- "learning_rate": 1.1595816670552429e-07,
1832
- "loss": 1.392,
1833
- "step": 1032
1834
- },
1835
- {
1836
- "epoch": 18.84,
1837
- "learning_rate": 1.0276282417007399e-07,
1838
- "loss": 1.2524,
1839
- "step": 1036
1840
- },
1841
- {
1842
- "epoch": 18.91,
1843
- "learning_rate": 9.035651368646647e-08,
1844
- "loss": 1.2067,
1845
- "step": 1040
1846
- },
1847
- {
1848
- "epoch": 18.98,
1849
- "learning_rate": 7.874123413208145e-08,
1850
- "loss": 1.2058,
1851
- "step": 1044
1852
- },
1853
- {
1854
- "epoch": 19.0,
1855
- "gpt4_scores": 0.5399999999999999,
1856
- "step": 1045
1857
- },
1858
- {
1859
- "epoch": 19.0,
1860
- "std": 0.1422673539502299,
1861
- "step": 1045
1862
- },
1863
- {
1864
- "epoch": 19.0,
1865
- "eval_loss": 1.9871299266815186,
1866
- "eval_runtime": 4.9631,
1867
- "eval_samples_per_second": 4.634,
1868
- "eval_steps_per_second": 1.209,
1869
- "step": 1045
1870
- },
1871
- {
1872
- "epoch": 19.05,
1873
- "learning_rate": 6.791885693514134e-08,
1874
- "loss": 1.2248,
1875
- "step": 1048
1876
- },
1877
- {
1878
- "epoch": 19.13,
1879
- "learning_rate": 5.7891125773187896e-08,
1880
- "loss": 1.3235,
1881
- "step": 1052
1882
- },
1883
- {
1884
- "epoch": 19.2,
1885
- "learning_rate": 4.865965629214819e-08,
1886
- "loss": 1.337,
1887
- "step": 1056
1888
- },
1889
- {
1890
- "epoch": 19.27,
1891
- "learning_rate": 4.02259358460233e-08,
1892
- "loss": 1.2373,
1893
- "step": 1060
1894
- },
1895
- {
1896
- "epoch": 19.35,
1897
- "learning_rate": 3.25913232572489e-08,
1898
- "loss": 1.2713,
1899
- "step": 1064
1900
- },
1901
- {
1902
- "epoch": 19.42,
1903
- "learning_rate": 2.57570485977654e-08,
1904
- "loss": 1.2856,
1905
- "step": 1068
1906
- },
1907
- {
1908
- "epoch": 19.49,
1909
- "learning_rate": 1.9724212990830938e-08,
1910
- "loss": 1.3416,
1911
- "step": 1072
1912
- },
1913
- {
1914
- "epoch": 19.56,
1915
- "learning_rate": 1.449378843361271e-08,
1916
- "loss": 1.2418,
1917
- "step": 1076
1918
- },
1919
- {
1920
- "epoch": 19.64,
1921
- "learning_rate": 1.006661764057837e-08,
1922
- "loss": 1.2665,
1923
- "step": 1080
1924
- },
1925
- {
1926
- "epoch": 19.71,
1927
- "learning_rate": 6.4434139077201865e-09,
1928
- "loss": 1.2837,
1929
- "step": 1084
1930
- },
1931
- {
1932
- "epoch": 19.78,
1933
- "learning_rate": 3.6247609976319818e-09,
1934
- "loss": 1.2405,
1935
- "step": 1088
1936
- },
1937
- {
1938
- "epoch": 19.85,
1939
- "learning_rate": 1.61111304545436e-09,
1940
- "loss": 1.3201,
1941
- "step": 1092
1942
- },
1943
- {
1944
- "epoch": 19.93,
1945
- "learning_rate": 4.027944857032395e-10,
1946
- "loss": 1.37,
1947
- "step": 1096
1948
- },
1949
- {
1950
- "epoch": 20.0,
1951
- "learning_rate": 0.0,
1952
- "loss": 1.2904,
1953
- "step": 1100
1954
- },
1955
- {
1956
- "epoch": 20.0,
1957
- "step": 1100,
1958
- "total_flos": 3.792475205866291e+16,
1959
  "train_loss": 0.0,
1960
- "train_runtime": 12.2134,
1961
- "train_samples_per_second": 355.347,
1962
- "train_steps_per_second": 90.065
1963
  }
1964
  ],
1965
  "logging_steps": 4,
1966
- "max_steps": 1100,
1967
  "num_input_tokens_seen": 0,
1968
- "num_train_epochs": 20,
1969
  "save_steps": 55,
1970
- "total_flos": 3.792475205866291e+16,
1971
  "train_batch_size": 4,
1972
  "trial_name": null,
1973
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 550,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.02,
13
+ "learning_rate": 9.090909090909091e-06,
14
  "loss": 2.3833,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.07,
19
+ "learning_rate": 3.6363636363636364e-05,
20
+ "loss": 2.4676,
21
  "step": 4
22
  },
23
  {
24
  "epoch": 0.15,
25
+ "learning_rate": 7.272727272727273e-05,
26
+ "loss": 2.2207,
27
  "step": 8
28
  },
29
  {
30
  "epoch": 0.22,
31
+ "learning_rate": 0.00010909090909090909,
32
+ "loss": 2.1377,
33
  "step": 12
34
  },
35
  {
36
  "epoch": 0.29,
37
+ "learning_rate": 0.00014545454545454546,
38
+ "loss": 1.9925,
39
  "step": 16
40
  },
41
  {
42
  "epoch": 0.36,
43
+ "learning_rate": 0.00018181818181818183,
44
+ "loss": 1.9958,
45
  "step": 20
46
  },
47
  {
48
  "epoch": 0.44,
49
+ "learning_rate": 0.00021818181818181818,
50
+ "loss": 1.8572,
51
  "step": 24
52
  },
53
  {
54
  "epoch": 0.51,
55
+ "learning_rate": 0.0002545454545454545,
56
+ "loss": 1.9631,
57
  "step": 28
58
  },
59
  {
60
  "epoch": 0.58,
61
+ "learning_rate": 0.0002909090909090909,
62
+ "loss": 1.8855,
63
  "step": 32
64
  },
65
  {
66
  "epoch": 0.65,
67
+ "learning_rate": 0.00032727272727272726,
68
+ "loss": 1.8563,
69
  "step": 36
70
  },
71
  {
72
  "epoch": 0.73,
73
+ "learning_rate": 0.00036363636363636367,
74
+ "loss": 1.77,
75
  "step": 40
76
  },
77
  {
78
  "epoch": 0.8,
79
+ "learning_rate": 0.0004,
80
+ "loss": 1.8378,
81
  "step": 44
82
  },
83
  {
84
  "epoch": 0.87,
85
+ "learning_rate": 0.00043636363636363637,
86
+ "loss": 1.9476,
87
  "step": 48
88
  },
89
  {
90
  "epoch": 0.95,
91
+ "learning_rate": 0.0004727272727272727,
92
+ "loss": 1.8713,
93
  "step": 52
94
  },
95
+ {
96
+ "epoch": 1.0,
97
+ "gpt4_scores": 0.5359999999999999,
98
+ "step": 55
99
+ },
100
+ {
101
+ "epoch": 1.0,
102
+ "std": 0.0580868315541483,
103
+ "step": 55
104
+ },
105
+ {
106
+ "epoch": 1.0,
107
+ "eval_loss": 1.8388524055480957,
108
+ "eval_runtime": 4.9686,
109
+ "eval_samples_per_second": 4.629,
110
+ "eval_steps_per_second": 1.208,
111
+ "step": 55
112
+ },
113
  {
114
  "epoch": 1.02,
115
+ "learning_rate": 0.0004999949650182266,
116
+ "loss": 1.7168,
117
  "step": 56
118
  },
119
  {
120
  "epoch": 1.09,
121
+ "learning_rate": 0.0004998741355957963,
122
+ "loss": 1.6773,
123
  "step": 60
124
  },
125
  {
126
  "epoch": 1.16,
127
+ "learning_rate": 0.0004995922759815339,
128
+ "loss": 1.6162,
129
  "step": 64
130
  },
131
  {
132
  "epoch": 1.24,
133
+ "learning_rate": 0.0004991495678185201,
134
+ "loss": 1.5625,
135
  "step": 68
136
  },
137
  {
138
  "epoch": 1.31,
139
+ "learning_rate": 0.0004985462964079136,
140
+ "loss": 1.4995,
141
  "step": 72
142
  },
143
  {
144
  "epoch": 1.38,
145
+ "learning_rate": 0.0004977828505250904,
146
+ "loss": 1.4863,
147
  "step": 76
148
  },
149
  {
150
  "epoch": 1.45,
151
+ "learning_rate": 0.0004968597221690986,
152
+ "loss": 1.4958,
153
  "step": 80
154
  },
155
  {
156
  "epoch": 1.53,
157
+ "learning_rate": 0.0004957775062455933,
158
+ "loss": 1.6001,
159
  "step": 84
160
  },
161
  {
162
  "epoch": 1.6,
163
+ "learning_rate": 0.0004945369001834514,
164
+ "loss": 1.5667,
165
  "step": 88
166
  },
167
  {
168
  "epoch": 1.67,
169
+ "learning_rate": 0.0004931387034853173,
170
+ "loss": 1.4828,
171
  "step": 92
172
  },
173
  {
174
  "epoch": 1.75,
175
+ "learning_rate": 0.0004915838172123671,
176
+ "loss": 1.4508,
177
  "step": 96
178
  },
179
  {
180
  "epoch": 1.82,
181
+ "learning_rate": 0.0004898732434036243,
182
+ "loss": 1.4932,
183
  "step": 100
184
  },
185
  {
186
  "epoch": 1.89,
187
+ "learning_rate": 0.0004880080844302004,
188
+ "loss": 1.4789,
189
  "step": 104
190
  },
191
  {
192
  "epoch": 1.96,
193
+ "learning_rate": 0.0004859895422848767,
194
+ "loss": 1.3297,
195
  "step": 108
196
  },
197
  {
198
  "epoch": 2.04,
199
+ "learning_rate": 0.00048381891780748665,
200
+ "loss": 1.0117,
201
  "step": 112
202
  },
203
  {
204
  "epoch": 2.11,
205
+ "learning_rate": 0.0004814976098465951,
206
+ "loss": 0.7652,
207
  "step": 116
208
  },
209
  {
210
  "epoch": 2.18,
211
+ "learning_rate": 0.0004790271143580174,
212
+ "loss": 0.8033,
213
  "step": 120
214
  },
215
  {
216
  "epoch": 2.25,
217
+ "learning_rate": 0.0004764090234407577,
218
+ "loss": 0.6905,
219
  "step": 124
220
  },
221
  {
222
  "epoch": 2.33,
223
+ "learning_rate": 0.0004736450243109884,
224
+ "loss": 0.7312,
225
  "step": 128
226
  },
227
  {
228
  "epoch": 2.4,
229
+ "learning_rate": 0.00047073689821473173,
230
+ "loss": 0.761,
231
  "step": 132
232
  },
233
  {
234
  "epoch": 2.47,
235
+ "learning_rate": 0.00046768651927994433,
236
+ "loss": 0.6884,
237
  "step": 136
238
  },
239
  {
240
  "epoch": 2.55,
241
+ "learning_rate": 0.0004644958533087443,
242
+ "loss": 0.623,
243
  "step": 140
244
  },
245
  {
246
  "epoch": 2.62,
247
+ "learning_rate": 0.0004611669565105596,
248
+ "loss": 0.7938,
249
  "step": 144
250
  },
251
  {
252
  "epoch": 2.69,
253
+ "learning_rate": 0.00045770197417701366,
254
+ "loss": 0.7256,
255
  "step": 148
256
  },
257
  {
258
  "epoch": 2.76,
259
+ "learning_rate": 0.00045410313929940244,
260
+ "loss": 0.8236,
261
  "step": 152
262
  },
263
  {
264
  "epoch": 2.84,
265
+ "learning_rate": 0.00045037277112965383,
266
+ "loss": 0.9053,
267
  "step": 156
268
  },
269
  {
270
  "epoch": 2.91,
271
+ "learning_rate": 0.0004465132736856969,
272
+ "loss": 0.8822,
273
  "step": 160
274
  },
275
  {
276
  "epoch": 2.98,
277
+ "learning_rate": 0.00044252713420220394,
278
+ "loss": 0.847,
279
  "step": 164
280
  },
281
  {
282
  "epoch": 3.05,
283
+ "learning_rate": 0.00043841692152770415,
284
+ "loss": 0.3486,
285
  "step": 168
286
  },
287
  {
288
  "epoch": 3.13,
289
+ "learning_rate": 0.00043418528446910123,
290
+ "loss": 0.3387,
291
  "step": 172
292
  },
293
  {
294
  "epoch": 3.2,
295
+ "learning_rate": 0.0004298349500846628,
296
+ "loss": 0.3827,
297
  "step": 176
298
  },
299
  {
300
  "epoch": 3.27,
301
+ "learning_rate": 0.00042536872192658034,
302
+ "loss": 0.3494,
303
  "step": 180
304
  },
305
  {
306
  "epoch": 3.35,
307
+ "learning_rate": 0.00042078947823423365,
308
+ "loss": 0.3773,
309
  "step": 184
310
  },
311
  {
312
  "epoch": 3.42,
313
+ "learning_rate": 0.0004161001700793231,
314
+ "loss": 0.327,
315
  "step": 188
316
  },
317
  {
318
  "epoch": 3.49,
319
+ "learning_rate": 0.00041130381946406574,
320
+ "loss": 0.4188,
321
  "step": 192
322
  },
323
  {
324
  "epoch": 3.56,
325
+ "learning_rate": 0.0004064035173736804,
326
+ "loss": 0.3763,
327
  "step": 196
328
  },
329
  {
330
  "epoch": 3.64,
331
+ "learning_rate": 0.00040140242178441667,
332
+ "loss": 0.3837,
333
  "step": 200
334
  },
335
  {
336
  "epoch": 3.71,
337
+ "learning_rate": 0.0003963037556284129,
338
+ "loss": 0.4003,
339
  "step": 204
340
  },
341
  {
342
  "epoch": 3.78,
343
+ "learning_rate": 0.0003911108047166924,
344
+ "loss": 0.32,
345
  "step": 208
346
  },
347
  {
348
  "epoch": 3.85,
349
+ "learning_rate": 0.00038582691562163827,
350
+ "loss": 0.3738,
351
  "step": 212
352
  },
353
  {
354
  "epoch": 3.93,
355
+ "learning_rate": 0.0003804554935203115,
356
+ "loss": 0.3584,
357
  "step": 216
358
  },
359
  {
360
  "epoch": 4.0,
361
+ "learning_rate": 0.000375,
362
+ "loss": 0.3469,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
  "step": 220
364
  },
365
  {
366
  "epoch": 4.07,
367
+ "learning_rate": 0.0003694639508274158,
368
+ "loss": 0.1782,
369
  "step": 224
370
  },
371
  {
372
  "epoch": 4.15,
373
+ "learning_rate": 0.0003638509136829758,
374
+ "loss": 0.1614,
375
  "step": 228
376
  },
377
  {
378
  "epoch": 4.22,
379
+ "learning_rate": 0.00035816450586162706,
380
+ "loss": 0.2127,
381
  "step": 232
382
  },
383
  {
384
  "epoch": 4.29,
385
+ "learning_rate": 0.00035240839194169884,
386
+ "loss": 0.173,
387
  "step": 236
388
  },
389
  {
390
  "epoch": 4.36,
391
+ "learning_rate": 0.00034658628142328216,
392
+ "loss": 0.1831,
393
  "step": 240
394
  },
395
  {
396
  "epoch": 4.44,
397
+ "learning_rate": 0.00034070192633766023,
398
+ "loss": 0.2235,
399
  "step": 244
400
  },
401
  {
402
  "epoch": 4.51,
403
+ "learning_rate": 0.0003347591188293301,
404
+ "loss": 0.1765,
405
  "step": 248
406
  },
407
  {
408
  "epoch": 4.58,
409
+ "learning_rate": 0.00032876168871217323,
410
+ "loss": 0.1828,
411
  "step": 252
412
  },
413
  {
414
  "epoch": 4.65,
415
+ "learning_rate": 0.00032271350100134975,
416
+ "loss": 0.1773,
417
  "step": 256
418
  },
419
  {
420
  "epoch": 4.73,
421
+ "learning_rate": 0.0003166184534225087,
422
+ "loss": 0.1496,
423
  "step": 260
424
  },
425
  {
426
  "epoch": 4.8,
427
+ "learning_rate": 0.0003104804738999169,
428
+ "loss": 0.176,
429
  "step": 264
430
  },
431
  {
432
  "epoch": 4.87,
433
+ "learning_rate": 0.00030430351802512693,
434
+ "loss": 0.1545,
435
  "step": 268
436
  },
437
  {
438
  "epoch": 4.95,
439
+ "learning_rate": 0.00029809156650781527,
440
+ "loss": 0.1631,
441
  "step": 272
442
  },
443
  {
444
  "epoch": 5.0,
445
+ "gpt4_scores": 0.6274,
446
  "step": 275
447
  },
448
  {
449
  "epoch": 5.0,
450
+ "std": 0.04891773502524415,
451
  "step": 275
452
  },
453
  {
454
  "epoch": 5.0,
455
+ "eval_loss": 2.9787437915802,
456
+ "eval_runtime": 4.9451,
457
+ "eval_samples_per_second": 4.651,
458
+ "eval_steps_per_second": 1.213,
459
  "step": 275
460
  },
461
  {
462
  "epoch": 5.02,
463
+ "learning_rate": 0.0002918486226104327,
464
+ "loss": 0.123,
465
  "step": 276
466
  },
467
  {
468
  "epoch": 5.09,
469
+ "learning_rate": 0.00028557870956832135,
470
+ "loss": 0.1251,
471
  "step": 280
472
  },
473
  {
474
  "epoch": 5.16,
475
+ "learning_rate": 0.0002792858679969596,
476
+ "loss": 0.0895,
477
  "step": 284
478
  },
479
  {
480
  "epoch": 5.24,
481
+ "learning_rate": 0.0002729741532880069,
482
+ "loss": 0.0952,
483
  "step": 288
484
  },
485
  {
486
  "epoch": 5.31,
487
+ "learning_rate": 0.000266647632995826,
488
+ "loss": 0.0914,
489
  "step": 292
490
  },
491
  {
492
  "epoch": 5.38,
493
+ "learning_rate": 0.00026031038421616684,
494
+ "loss": 0.1204,
495
  "step": 296
496
  },
497
  {
498
  "epoch": 5.45,
499
+ "learning_rate": 0.000253966490958702,
500
+ "loss": 0.1004,
501
  "step": 300
502
  },
503
  {
504
  "epoch": 5.53,
505
+ "learning_rate": 0.00024762004151510585,
506
+ "loss": 0.1251,
507
  "step": 304
508
  },
509
  {
510
  "epoch": 5.6,
511
+ "learning_rate": 0.00024127512582437484,
512
+ "loss": 0.0999,
513
  "step": 308
514
  },
515
  {
516
  "epoch": 5.67,
517
+ "learning_rate": 0.00023493583283708543,
518
+ "loss": 0.0932,
519
  "step": 312
520
  },
521
  {
522
  "epoch": 5.75,
523
+ "learning_rate": 0.00022860624788029015,
524
+ "loss": 0.0786,
525
  "step": 316
526
  },
527
  {
528
  "epoch": 5.82,
529
+ "learning_rate": 0.00022229045002474727,
530
+ "loss": 0.0984,
531
  "step": 320
532
  },
533
  {
534
  "epoch": 5.89,
535
+ "learning_rate": 0.000215992509456184,
536
+ "loss": 0.0769,
537
  "step": 324
538
  },
539
  {
540
  "epoch": 5.96,
541
+ "learning_rate": 0.000209716484852284,
542
+ "loss": 0.0765,
543
  "step": 328
544
  },
545
  {
546
  "epoch": 6.0,
547
+ "gpt4_scores": 0.5669999999999998,
548
  "step": 330
549
  },
550
  {
551
  "epoch": 6.0,
552
+ "std": 0.05915420526048846,
553
  "step": 330
554
  },
555
  {
556
  "epoch": 6.0,
557
+ "eval_loss": 3.0907390117645264,
558
+ "eval_runtime": 4.9453,
559
+ "eval_samples_per_second": 4.651,
560
+ "eval_steps_per_second": 1.213,
561
  "step": 330
562
  },
563
  {
564
  "epoch": 6.04,
565
+ "learning_rate": 0.0002034664207670925,
566
+ "loss": 0.0632,
567
  "step": 332
568
  },
569
  {
570
  "epoch": 6.11,
571
+ "learning_rate": 0.0001972463450245226,
572
+ "loss": 0.0566,
573
  "step": 336
574
  },
575
  {
576
  "epoch": 6.18,
577
+ "learning_rate": 0.00019106026612264316,
578
+ "loss": 0.054,
579
  "step": 340
580
  },
581
  {
582
  "epoch": 6.25,
583
+ "learning_rate": 0.00018491217065042198,
584
+ "loss": 0.0482,
585
  "step": 344
586
  },
587
  {
588
  "epoch": 6.33,
589
+ "learning_rate": 0.00017880602071858692,
590
+ "loss": 0.0482,
591
  "step": 348
592
  },
593
  {
594
  "epoch": 6.4,
595
+ "learning_rate": 0.00017274575140626317,
596
+ "loss": 0.0575,
597
  "step": 352
598
  },
599
  {
600
  "epoch": 6.47,
601
+ "learning_rate": 0.00016673526822502983,
602
+ "loss": 0.0506,
603
  "step": 356
604
  },
605
  {
606
  "epoch": 6.55,
607
+ "learning_rate": 0.00016077844460203207,
608
+ "loss": 0.0536,
609
  "step": 360
610
  },
611
  {
612
  "epoch": 6.62,
613
+ "learning_rate": 0.00015487911938376925,
614
+ "loss": 0.0503,
615
  "step": 364
616
  },
617
  {
618
  "epoch": 6.69,
619
+ "learning_rate": 0.00014904109436216883,
620
+ "loss": 0.0507,
621
  "step": 368
622
  },
623
  {
624
  "epoch": 6.76,
625
+ "learning_rate": 0.00014326813182453956,
626
+ "loss": 0.0572,
627
  "step": 372
628
  },
629
  {
630
  "epoch": 6.84,
631
+ "learning_rate": 0.0001375639521289836,
632
+ "loss": 0.0575,
633
  "step": 376
634
  },
635
  {
636
  "epoch": 6.91,
637
+ "learning_rate": 0.00013193223130682935,
638
+ "loss": 0.0546,
639
  "step": 380
640
  },
641
  {
642
  "epoch": 6.98,
643
+ "learning_rate": 0.00012637659869363084,
644
+ "loss": 0.0489,
645
  "step": 384
646
  },
647
  {
648
  "epoch": 7.0,
649
+ "gpt4_scores": 0.5259999999999999,
650
  "step": 385
651
  },
652
  {
653
  "epoch": 7.0,
654
+ "std": 0.05718811065247741,
655
  "step": 385
656
  },
657
  {
658
  "epoch": 7.0,
659
+ "eval_loss": 3.2638301849365234,
660
+ "eval_runtime": 4.9708,
661
+ "eval_samples_per_second": 4.627,
662
+ "eval_steps_per_second": 1.207,
663
  "step": 385
664
  },
665
  {
666
  "epoch": 7.05,
667
+ "learning_rate": 0.00012090063459025954,
668
+ "loss": 0.0396,
669
  "step": 388
670
  },
671
  {
672
  "epoch": 7.13,
673
+ "learning_rate": 0.0001155078679555969,
674
+ "loss": 0.0441,
675
  "step": 392
676
  },
677
  {
678
  "epoch": 7.2,
679
+ "learning_rate": 0.00011020177413231333,
680
+ "loss": 0.0413,
681
  "step": 396
682
  },
683
  {
684
  "epoch": 7.27,
685
+ "learning_rate": 0.00010498577260720049,
686
+ "loss": 0.0436,
687
  "step": 400
688
  },
689
  {
690
  "epoch": 7.35,
691
+ "learning_rate": 9.986322480749927e-05,
692
+ "loss": 0.0532,
693
  "step": 404
694
  },
695
  {
696
  "epoch": 7.42,
697
+ "learning_rate": 9.483743193464408e-05,
698
+ "loss": 0.0375,
699
  "step": 408
700
  },
701
  {
702
  "epoch": 7.49,
703
+ "learning_rate": 8.991163283681945e-05,
704
+ "loss": 0.0431,
705
  "step": 412
706
  },
707
  {
708
  "epoch": 7.56,
709
+ "learning_rate": 8.508900192169963e-05,
710
+ "loss": 0.0449,
711
  "step": 416
712
  },
713
  {
714
  "epoch": 7.64,
715
+ "learning_rate": 8.037264711071699e-05,
716
+ "loss": 0.0502,
717
  "step": 420
718
  },
719
  {
720
  "epoch": 7.71,
721
+ "learning_rate": 7.576560783617667e-05,
722
+ "loss": 0.038,
723
  "step": 424
724
  },
725
  {
726
  "epoch": 7.78,
727
+ "learning_rate": 7.127085308250913e-05,
728
+ "loss": 0.0483,
729
  "step": 428
730
  },
731
  {
732
  "epoch": 7.85,
733
+ "learning_rate": 6.689127947292231e-05,
734
+ "loss": 0.0411,
735
  "step": 432
736
  },
737
  {
738
  "epoch": 7.93,
739
+ "learning_rate": 6.262970940268654e-05,
740
+ "loss": 0.0529,
741
  "step": 436
742
  },
743
  {
744
  "epoch": 8.0,
745
+ "learning_rate": 5.848888922025553e-05,
746
+ "loss": 0.0559,
747
  "step": 440
748
  },
749
  {
750
  "epoch": 8.0,
751
+ "gpt4_scores": 0.574,
752
  "step": 440
753
  },
754
  {
755
  "epoch": 8.0,
756
+ "std": 0.056200355870759375,
757
  "step": 440
758
  },
759
  {
760
  "epoch": 8.0,
761
+ "eval_loss": 3.374971389770508,
762
+ "eval_runtime": 4.9393,
763
+ "eval_samples_per_second": 4.657,
764
+ "eval_steps_per_second": 1.215,
765
  "step": 440
766
  },
767
  {
768
  "epoch": 8.07,
769
+ "learning_rate": 5.4471487457395216e-05,
770
+ "loss": 0.0389,
771
  "step": 444
772
  },
773
  {
774
  "epoch": 8.15,
775
+ "learning_rate": 5.058009310946118e-05,
776
+ "loss": 0.0445,
777
  "step": 448
778
  },
779
  {
780
  "epoch": 8.22,
781
+ "learning_rate": 4.6817213966933034e-05,
782
+ "loss": 0.0467,
783
  "step": 452
784
  },
785
  {
786
  "epoch": 8.29,
787
+ "learning_rate": 4.318527499928074e-05,
788
+ "loss": 0.0357,
789
  "step": 456
790
  },
791
  {
792
  "epoch": 8.36,
793
+ "learning_rate": 3.968661679220467e-05,
794
+ "loss": 0.0469,
795
  "step": 460
796
  },
797
  {
798
  "epoch": 8.44,
799
+ "learning_rate": 3.632349403925664e-05,
800
+ "loss": 0.0503,
801
  "step": 464
802
  },
803
  {
804
  "epoch": 8.51,
805
+ "learning_rate": 3.309807408881269e-05,
806
+ "loss": 0.0419,
807
  "step": 468
808
  },
809
  {
810
  "epoch": 8.58,
811
+ "learning_rate": 3.0012435547336736e-05,
812
+ "loss": 0.0437,
813
  "step": 472
814
  },
815
  {
816
  "epoch": 8.65,
817
+ "learning_rate": 2.7068566939831645e-05,
818
+ "loss": 0.0388,
819
  "step": 476
820
  },
821
  {
822
  "epoch": 8.73,
823
+ "learning_rate": 2.4268365428344735e-05,
824
+ "loss": 0.0379,
825
  "step": 480
826
  },
827
  {
828
  "epoch": 8.8,
829
+ "learning_rate": 2.1613635589349755e-05,
830
+ "loss": 0.0442,
831
  "step": 484
832
  },
833
  {
834
  "epoch": 8.87,
835
+ "learning_rate": 1.9106088250797264e-05,
836
+ "loss": 0.0529,
837
  "step": 488
838
  },
839
  {
840
  "epoch": 8.95,
841
+ "learning_rate": 1.674733938957873e-05,
842
+ "loss": 0.0424,
843
  "step": 492
844
  },
845
  {
846
  "epoch": 9.0,
847
+ "gpt4_scores": 0.586,
848
  "step": 495
849
  },
850
  {
851
  "epoch": 9.0,
852
+ "std": 0.05261254603229158,
853
  "step": 495
854
  },
855
  {
856
  "epoch": 9.0,
857
+ "eval_loss": 3.401412010192871,
858
+ "eval_runtime": 4.9704,
859
+ "eval_samples_per_second": 4.627,
860
+ "eval_steps_per_second": 1.207,
861
  "step": 495
862
  },
863
  {
864
  "epoch": 9.02,
865
+ "learning_rate": 1.4538909090118846e-05,
866
+ "loss": 0.0488,
867
  "step": 496
868
  },
869
  {
870
  "epoch": 9.09,
871
+ "learning_rate": 1.2482220564763668e-05,
872
+ "loss": 0.0406,
873
  "step": 500
874
  },
875
  {
876
  "epoch": 9.16,
877
+ "learning_rate": 1.0578599236598707e-05,
878
+ "loss": 0.0387,
879
  "step": 504
880
  },
881
  {
882
  "epoch": 9.24,
883
+ "learning_rate": 8.829271885286095e-06,
884
+ "loss": 0.0413,
885
  "step": 508
886
  },
887
  {
888
  "epoch": 9.31,
889
+ "learning_rate": 7.235365856472442e-06,
890
+ "loss": 0.0512,
891
  "step": 512
892
  },
893
  {
894
  "epoch": 9.38,
895
+ "learning_rate": 5.797908335276214e-06,
896
+ "loss": 0.0451,
897
  "step": 516
898
  },
899
  {
900
  "epoch": 9.45,
901
+ "learning_rate": 4.517825684323323e-06,
902
+ "loss": 0.0385,
903
  "step": 520
904
  },
905
  {
906
  "epoch": 9.53,
907
+ "learning_rate": 3.3959428467570664e-06,
908
+ "loss": 0.0377,
909
  "step": 524
910
  },
911
  {
912
  "epoch": 9.6,
913
+ "learning_rate": 2.4329828146074094e-06,
914
+ "loss": 0.0434,
915
  "step": 528
916
  },
917
  {
918
  "epoch": 9.67,
919
+ "learning_rate": 1.6295661628624448e-06,
920
+ "loss": 0.0442,
921
  "step": 532
922
  },
923
  {
924
  "epoch": 9.75,
925
+ "learning_rate": 9.862106495415469e-07,
926
+ "loss": 0.0418,
927
  "step": 536
928
  },
929
  {
930
  "epoch": 9.82,
931
+ "learning_rate": 5.033308820289185e-07,
932
+ "loss": 0.0457,
933
  "step": 540
934
  },
935
  {
936
  "epoch": 9.89,
937
+ "learning_rate": 1.8123804988159908e-07,
938
+ "loss": 0.0428,
939
  "step": 544
940
  },
941
  {
942
  "epoch": 9.96,
943
+ "learning_rate": 2.0139724285161975e-08,
944
+ "loss": 0.0449,
945
  "step": 548
946
  },
947
  {
948
  "epoch": 10.0,
949
+ "step": 550,
950
+ "total_flos": 1.879358707856179e+16,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
951
  "train_loss": 0.0,
952
+ "train_runtime": 16.9739,
953
+ "train_samples_per_second": 127.843,
954
+ "train_steps_per_second": 32.403
955
  }
956
  ],
957
  "logging_steps": 4,
958
+ "max_steps": 550,
959
  "num_input_tokens_seen": 0,
960
+ "num_train_epochs": 10,
961
  "save_steps": 55,
962
+ "total_flos": 1.879358707856179e+16,
963
  "train_batch_size": 4,
964
  "trial_name": null,
965
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39c0cbcdecc598ea92cc8497e5878c5330ccd30898cfdcd997c51d3493250c93
3
  size 4792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15469a96348f5cb785a51a156b48ad22e68226bd7cb275db840025eac257b8ea
3
  size 4792