TinyPixel commited on
Commit
200976b
·
1 Parent(s): 185f75e

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -19,8 +19,8 @@
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
22
- "c_attn",
23
- "c_proj"
24
  ],
25
  "task_type": "CAUSAL_LM",
26
  "use_rslora": false
 
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
22
+ "c_proj",
23
+ "c_attn"
24
  ],
25
  "task_type": "CAUSAL_LM",
26
  "use_rslora": false
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6fad1f0f20157a26f734a074eb783171c3e37f18de3fafb92563b93d66891e83
3
  size 121915776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3ba49f73a389971e386b4f6a21d7d4552dbea70aed7d8edffa40c1e9dc9fa7d
3
  size 121915776
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:becf57dfb4ed283919aa9e5a48f763a3948e4ce33b078cc5ce42616b9a524088
3
- size 243883386
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:699481cdb4057dad1811072a3784c5c230ed68cc7b5dd2db650491588235f3f7
3
+ size 243883194
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:77b677e0ba2320f567b86677ab2ccf21c5f01a3f7757ff0850a3346d549280bc
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:242671a394d6ab31de2cba9a24ffa0e05edcd8ff2eccd478f7b1d3b9cfdac40f
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c94c7d4ad946c884217b7f94a724b7226f83f3d4a0eace0f817add9b2e3e41e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ced1bbbbd6439b37716ff72c4d2a98af0eeaed72dba8340a1ae4eb8f8b02ec40
3
  size 1064
trainer_state.json CHANGED
@@ -1,1676 +1,620 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.9979633401221997,
5
  "eval_steps": 500,
6
- "global_step": 552,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.01,
13
- "learning_rate": 2.3529411764705885e-06,
14
- "loss": 1.3358,
15
  "step": 2
16
  },
17
  {
18
- "epoch": 0.02,
19
- "learning_rate": 4.705882352941177e-06,
20
- "loss": 1.5405,
21
  "step": 4
22
  },
23
  {
24
- "epoch": 0.03,
25
- "learning_rate": 7.058823529411766e-06,
26
- "loss": 1.4264,
27
  "step": 6
28
  },
29
  {
30
- "epoch": 0.04,
31
- "learning_rate": 9.411764705882354e-06,
32
- "loss": 1.4248,
33
  "step": 8
34
  },
35
  {
36
- "epoch": 0.05,
37
- "learning_rate": 1.1764705882352942e-05,
38
- "loss": 1.6001,
39
  "step": 10
40
  },
41
  {
42
- "epoch": 0.07,
43
- "learning_rate": 1.4117647058823532e-05,
44
- "loss": 1.7231,
45
  "step": 12
46
  },
47
  {
48
- "epoch": 0.08,
49
- "learning_rate": 1.647058823529412e-05,
50
- "loss": 1.4676,
51
  "step": 14
52
  },
53
  {
54
- "epoch": 0.09,
55
- "learning_rate": 1.8823529411764708e-05,
56
- "loss": 1.5217,
57
  "step": 16
58
  },
59
  {
60
- "epoch": 0.1,
61
- "learning_rate": 1.999982759060109e-05,
62
- "loss": 1.5314,
63
  "step": 18
64
  },
65
  {
66
- "epoch": 0.11,
67
- "learning_rate": 1.999844835107957e-05,
68
- "loss": 1.5114,
69
  "step": 20
70
  },
71
  {
72
- "epoch": 0.12,
73
- "learning_rate": 1.9995690062269985e-05,
74
- "loss": 1.4863,
75
  "step": 22
76
  },
77
  {
78
- "epoch": 0.13,
79
- "learning_rate": 1.9991553104612982e-05,
80
- "loss": 1.6274,
81
  "step": 24
82
  },
83
  {
84
- "epoch": 0.14,
85
- "learning_rate": 1.998603804870395e-05,
86
- "loss": 1.677,
87
  "step": 26
88
  },
89
  {
90
- "epoch": 0.15,
91
- "learning_rate": 1.9979145655214306e-05,
92
- "loss": 1.7467,
93
  "step": 28
94
  },
95
  {
96
- "epoch": 0.16,
97
- "learning_rate": 1.99708768747866e-05,
98
- "loss": 1.8311,
99
  "step": 30
100
  },
101
  {
102
- "epoch": 0.17,
103
- "learning_rate": 1.996123284790336e-05,
104
- "loss": 1.9136,
105
  "step": 32
106
  },
107
  {
108
- "epoch": 0.18,
109
- "learning_rate": 1.995021490472983e-05,
110
- "loss": 1.8733,
111
  "step": 34
112
  },
113
  {
114
- "epoch": 0.2,
115
- "learning_rate": 1.9937824564930474e-05,
116
- "loss": 1.8744,
117
  "step": 36
118
  },
119
  {
120
- "epoch": 0.21,
121
- "learning_rate": 1.992406353745939e-05,
122
- "loss": 2.034,
123
  "step": 38
124
  },
125
  {
126
- "epoch": 0.22,
127
- "learning_rate": 1.990893372032459e-05,
128
- "loss": 2.0344,
129
  "step": 40
130
  },
131
  {
132
- "epoch": 0.23,
133
- "learning_rate": 1.989243720032624e-05,
134
- "loss": 2.0093,
135
  "step": 42
136
  },
137
  {
138
- "epoch": 0.24,
139
- "learning_rate": 1.9874576252768793e-05,
140
- "loss": 2.3547,
141
  "step": 44
142
  },
143
  {
144
- "epoch": 0.25,
145
- "learning_rate": 1.98553533411472e-05,
146
- "loss": 2.2031,
147
  "step": 46
148
  },
149
  {
150
- "epoch": 0.26,
151
- "learning_rate": 1.983477111680712e-05,
152
- "loss": 1.3899,
153
  "step": 48
154
  },
155
  {
156
- "epoch": 0.27,
157
- "learning_rate": 1.981283241857922e-05,
158
- "loss": 1.3528,
159
  "step": 50
160
  },
161
  {
162
- "epoch": 0.28,
163
- "learning_rate": 1.978954027238763e-05,
164
- "loss": 1.4511,
165
  "step": 52
166
  },
167
  {
168
- "epoch": 0.29,
169
- "learning_rate": 1.9764897890832597e-05,
170
- "loss": 1.6028,
171
  "step": 54
172
  },
173
  {
174
- "epoch": 0.3,
175
- "learning_rate": 1.973890867274738e-05,
176
- "loss": 1.5308,
177
  "step": 56
178
  },
179
  {
180
- "epoch": 0.32,
181
- "learning_rate": 1.9711576202729445e-05,
182
- "loss": 1.3301,
183
  "step": 58
184
  },
185
  {
186
- "epoch": 0.33,
187
- "learning_rate": 1.9682904250646084e-05,
188
- "loss": 1.5238,
189
  "step": 60
190
  },
191
  {
192
- "epoch": 0.34,
193
- "learning_rate": 1.9652896771114416e-05,
194
- "loss": 1.5662,
195
  "step": 62
196
  },
197
  {
198
- "epoch": 0.35,
199
- "learning_rate": 1.962155790295597e-05,
200
- "loss": 1.6441,
201
  "step": 64
202
  },
203
  {
204
- "epoch": 0.36,
205
- "learning_rate": 1.9588891968625828e-05,
206
- "loss": 1.3808,
207
  "step": 66
208
  },
209
  {
210
- "epoch": 0.37,
211
- "learning_rate": 1.9554903473616432e-05,
212
- "loss": 1.6823,
213
  "step": 68
214
  },
215
  {
216
- "epoch": 0.38,
217
- "learning_rate": 1.951959710583616e-05,
218
- "loss": 1.6271,
219
  "step": 70
220
  },
221
  {
222
- "epoch": 0.39,
223
- "learning_rate": 1.9482977734962753e-05,
224
- "loss": 1.6656,
225
  "step": 72
226
  },
227
  {
228
- "epoch": 0.4,
229
- "learning_rate": 1.9445050411771648e-05,
230
- "loss": 1.5312,
231
  "step": 74
232
  },
233
  {
234
- "epoch": 0.41,
235
- "learning_rate": 1.9405820367439343e-05,
236
- "loss": 1.5661,
237
  "step": 76
238
  },
239
  {
240
- "epoch": 0.42,
241
- "learning_rate": 1.9365293012821887e-05,
242
- "loss": 1.988,
243
  "step": 78
244
  },
245
  {
246
- "epoch": 0.43,
247
- "learning_rate": 1.9323473937708565e-05,
248
- "loss": 1.5985,
249
  "step": 80
250
  },
251
  {
252
- "epoch": 0.45,
253
- "learning_rate": 1.9280368910050943e-05,
254
- "loss": 1.7539,
255
  "step": 82
256
  },
257
  {
258
- "epoch": 0.46,
259
- "learning_rate": 1.9235983875167296e-05,
260
- "loss": 1.7707,
261
  "step": 84
262
  },
263
  {
264
- "epoch": 0.47,
265
- "learning_rate": 1.9190324954922594e-05,
266
- "loss": 1.9789,
267
  "step": 86
268
  },
269
  {
270
- "epoch": 0.48,
271
- "learning_rate": 1.914339844688415e-05,
272
- "loss": 2.0714,
273
  "step": 88
274
  },
275
  {
276
- "epoch": 0.49,
277
- "learning_rate": 1.9095210823452997e-05,
278
- "loss": 1.999,
279
  "step": 90
280
  },
281
  {
282
- "epoch": 0.5,
283
- "learning_rate": 1.9045768730971198e-05,
284
- "loss": 2.2815,
285
  "step": 92
286
  },
287
  {
288
- "epoch": 0.51,
289
- "learning_rate": 1.899507898880512e-05,
290
- "loss": 1.3301,
291
  "step": 94
292
  },
293
  {
294
- "epoch": 0.52,
295
- "learning_rate": 1.8943148588404877e-05,
296
- "loss": 1.2994,
297
  "step": 96
298
  },
299
  {
300
- "epoch": 0.53,
301
- "learning_rate": 1.8889984692340015e-05,
302
- "loss": 1.3904,
303
  "step": 98
304
  },
305
  {
306
- "epoch": 0.54,
307
- "learning_rate": 1.883559463331162e-05,
308
- "loss": 1.4148,
309
  "step": 100
310
  },
311
  {
312
- "epoch": 0.55,
313
- "learning_rate": 1.8779985913140927e-05,
314
- "loss": 1.4958,
315
  "step": 102
316
  },
317
  {
318
- "epoch": 0.56,
319
- "learning_rate": 1.8723166201734626e-05,
320
- "loss": 1.4387,
321
  "step": 104
322
  },
323
  {
324
- "epoch": 0.58,
325
- "learning_rate": 1.8665143336027e-05,
326
- "loss": 1.5561,
327
  "step": 106
328
  },
329
  {
330
- "epoch": 0.59,
331
- "learning_rate": 1.8605925318898973e-05,
332
- "loss": 1.4122,
333
  "step": 108
334
  },
335
  {
336
- "epoch": 0.6,
337
- "learning_rate": 1.8545520318074328e-05,
338
- "loss": 1.4326,
339
  "step": 110
340
  },
341
  {
342
- "epoch": 0.61,
343
- "learning_rate": 1.8483936664993152e-05,
344
- "loss": 1.5037,
345
  "step": 112
346
  },
347
  {
348
- "epoch": 0.62,
349
- "learning_rate": 1.8421182853662704e-05,
350
- "loss": 1.5058,
351
  "step": 114
352
  },
353
  {
354
- "epoch": 0.63,
355
- "learning_rate": 1.835726753948589e-05,
356
- "loss": 1.4941,
357
  "step": 116
358
  },
359
  {
360
- "epoch": 0.64,
361
- "learning_rate": 1.829219953806743e-05,
362
- "loss": 1.532,
363
  "step": 118
364
  },
365
  {
366
- "epoch": 0.65,
367
- "learning_rate": 1.8225987823997967e-05,
368
- "loss": 1.6664,
369
  "step": 120
370
  },
371
  {
372
- "epoch": 0.66,
373
- "learning_rate": 1.815864152961624e-05,
374
- "loss": 1.4904,
375
  "step": 122
376
  },
377
  {
378
- "epoch": 0.67,
379
- "learning_rate": 1.8090169943749477e-05,
380
- "loss": 1.5447,
381
  "step": 124
382
  },
383
  {
384
- "epoch": 0.68,
385
- "learning_rate": 1.8020582510432234e-05,
386
- "loss": 1.7077,
387
  "step": 126
388
  },
389
  {
390
- "epoch": 0.7,
391
- "learning_rate": 1.7949888827603813e-05,
392
- "loss": 1.7231,
393
  "step": 128
394
  },
395
  {
396
- "epoch": 0.71,
397
- "learning_rate": 1.7878098645784447e-05,
398
- "loss": 1.9262,
399
  "step": 130
400
  },
401
  {
402
- "epoch": 0.72,
403
- "learning_rate": 1.780522186673046e-05,
404
- "loss": 1.9847,
405
  "step": 132
406
  },
407
  {
408
- "epoch": 0.73,
409
- "learning_rate": 1.7731268542068536e-05,
410
- "loss": 1.9705,
411
  "step": 134
412
  },
413
  {
414
- "epoch": 0.74,
415
- "learning_rate": 1.7656248871909346e-05,
416
- "loss": 2.1022,
417
  "step": 136
418
  },
419
  {
420
- "epoch": 0.75,
421
- "learning_rate": 1.758017320344068e-05,
422
- "loss": 2.2104,
423
  "step": 138
424
  },
425
  {
426
- "epoch": 0.76,
427
- "learning_rate": 1.7503052029500308e-05,
428
- "loss": 1.3489,
429
  "step": 140
430
  },
431
  {
432
- "epoch": 0.77,
433
- "learning_rate": 1.7424895987128723e-05,
434
- "loss": 1.4892,
435
  "step": 142
436
  },
437
  {
438
- "epoch": 0.78,
439
- "learning_rate": 1.7345715856102024e-05,
440
- "loss": 1.374,
441
  "step": 144
442
  },
443
  {
444
- "epoch": 0.79,
445
- "learning_rate": 1.7265522557445115e-05,
446
- "loss": 1.456,
447
  "step": 146
448
  },
449
  {
450
- "epoch": 0.8,
451
- "learning_rate": 1.7184327151925366e-05,
452
- "loss": 1.4002,
453
  "step": 148
454
  },
455
  {
456
- "epoch": 0.81,
457
- "learning_rate": 1.710214083852709e-05,
458
- "loss": 1.5387,
459
  "step": 150
460
  },
461
  {
462
- "epoch": 0.83,
463
- "learning_rate": 1.7018974952906885e-05,
464
- "loss": 1.7009,
465
  "step": 152
466
  },
467
  {
468
- "epoch": 0.84,
469
- "learning_rate": 1.693484096583014e-05,
470
- "loss": 1.5339,
471
  "step": 154
472
  },
473
  {
474
- "epoch": 0.85,
475
- "learning_rate": 1.6849750481588936e-05,
476
- "loss": 1.6124,
477
  "step": 156
478
  },
479
  {
480
- "epoch": 0.86,
481
- "learning_rate": 1.6763715236401493e-05,
482
- "loss": 1.6115,
483
  "step": 158
484
  },
485
  {
486
- "epoch": 0.87,
487
- "learning_rate": 1.667674709679344e-05,
488
- "loss": 1.5313,
489
  "step": 160
490
  },
491
  {
492
- "epoch": 0.88,
493
- "learning_rate": 1.658885805796111e-05,
494
- "loss": 1.4972,
495
  "step": 162
496
  },
497
  {
498
- "epoch": 0.89,
499
- "learning_rate": 1.6500060242117096e-05,
500
- "loss": 1.5446,
501
  "step": 164
502
  },
503
  {
504
- "epoch": 0.9,
505
- "learning_rate": 1.6410365896818253e-05,
506
- "loss": 1.4892,
507
  "step": 166
508
  },
509
  {
510
- "epoch": 0.91,
511
- "learning_rate": 1.6319787393276463e-05,
512
- "loss": 1.6493,
513
  "step": 168
514
  },
515
  {
516
- "epoch": 0.92,
517
- "learning_rate": 1.6228337224652307e-05,
518
- "loss": 1.8135,
519
  "step": 170
520
  },
521
  {
522
- "epoch": 0.93,
523
- "learning_rate": 1.613602800433194e-05,
524
- "loss": 1.8594,
525
  "step": 172
526
  },
527
  {
528
- "epoch": 0.95,
529
- "learning_rate": 1.6042872464187352e-05,
530
- "loss": 1.8211,
531
  "step": 174
532
  },
533
  {
534
- "epoch": 0.96,
535
- "learning_rate": 1.5948883452820326e-05,
536
- "loss": 2.0184,
537
  "step": 176
538
  },
539
  {
540
- "epoch": 0.97,
541
- "learning_rate": 1.5854073933790277e-05,
542
- "loss": 2.2317,
543
  "step": 178
544
  },
545
  {
546
- "epoch": 0.98,
547
- "learning_rate": 1.575845698382622e-05,
548
- "loss": 2.1406,
549
  "step": 180
550
  },
551
  {
552
- "epoch": 0.99,
553
- "learning_rate": 1.566204579102317e-05,
554
- "loss": 2.1213,
555
  "step": 182
556
  },
557
  {
558
- "epoch": 1.0,
559
- "learning_rate": 1.556485365302313e-05,
560
- "loss": 1.8523,
561
  "step": 184
562
  },
563
  {
564
- "epoch": 1.01,
565
- "learning_rate": 1.546689397518101e-05,
566
- "loss": 1.2843,
567
  "step": 186
568
  },
569
  {
570
- "epoch": 1.02,
571
- "learning_rate": 1.5368180268715678e-05,
572
- "loss": 1.412,
573
  "step": 188
574
  },
575
  {
576
- "epoch": 1.03,
577
- "learning_rate": 1.52687261488464e-05,
578
- "loss": 1.4475,
579
  "step": 190
580
  },
581
  {
582
- "epoch": 1.04,
583
- "learning_rate": 1.5168545332914942e-05,
584
- "loss": 1.3569,
585
  "step": 192
586
  },
587
  {
588
- "epoch": 1.05,
589
- "learning_rate": 1.50676516384936e-05,
590
- "loss": 1.4637,
591
  "step": 194
592
  },
593
  {
594
- "epoch": 1.06,
595
- "learning_rate": 1.496605898147938e-05,
596
- "loss": 1.5047,
597
  "step": 196
598
  },
599
  {
600
- "epoch": 1.08,
601
- "learning_rate": 1.4863781374174625e-05,
602
- "loss": 1.4276,
603
  "step": 198
604
  },
605
  {
606
- "epoch": 1.09,
607
- "learning_rate": 1.4760832923354375e-05,
608
- "loss": 1.4892,
609
  "step": 200
610
- },
611
- {
612
- "epoch": 1.1,
613
- "learning_rate": 1.4657227828320637e-05,
614
- "loss": 1.5009,
615
- "step": 202
616
- },
617
- {
618
- "epoch": 1.11,
619
- "learning_rate": 1.4552980378943953e-05,
620
- "loss": 1.4748,
621
- "step": 204
622
- },
623
- {
624
- "epoch": 1.12,
625
- "learning_rate": 1.4448104953692443e-05,
626
- "loss": 1.4862,
627
- "step": 206
628
- },
629
- {
630
- "epoch": 1.13,
631
- "learning_rate": 1.4342616017648632e-05,
632
- "loss": 1.4506,
633
- "step": 208
634
- },
635
- {
636
- "epoch": 1.14,
637
- "learning_rate": 1.423652812051434e-05,
638
- "loss": 1.5143,
639
- "step": 210
640
- },
641
- {
642
- "epoch": 1.15,
643
- "learning_rate": 1.4129855894603885e-05,
644
- "loss": 1.5923,
645
- "step": 212
646
- },
647
- {
648
- "epoch": 1.16,
649
- "learning_rate": 1.4022614052825918e-05,
650
- "loss": 1.4228,
651
- "step": 214
652
- },
653
- {
654
- "epoch": 1.17,
655
- "learning_rate": 1.3914817386654112e-05,
656
- "loss": 1.623,
657
- "step": 216
658
- },
659
- {
660
- "epoch": 1.18,
661
- "learning_rate": 1.3806480764087027e-05,
662
- "loss": 1.7002,
663
- "step": 218
664
- },
665
- {
666
- "epoch": 1.19,
667
- "learning_rate": 1.369761912759744e-05,
668
- "loss": 1.7979,
669
- "step": 220
670
- },
671
- {
672
- "epoch": 1.21,
673
- "learning_rate": 1.358824749207136e-05,
674
- "loss": 1.8978,
675
- "step": 222
676
- },
677
- {
678
- "epoch": 1.22,
679
- "learning_rate": 1.3478380942737097e-05,
680
- "loss": 1.9344,
681
- "step": 224
682
- },
683
- {
684
- "epoch": 1.23,
685
- "learning_rate": 1.3368034633084603e-05,
686
- "loss": 2.0363,
687
- "step": 226
688
- },
689
- {
690
- "epoch": 1.24,
691
- "learning_rate": 1.3257223782775412e-05,
692
- "loss": 2.0626,
693
- "step": 228
694
- },
695
- {
696
- "epoch": 1.25,
697
- "learning_rate": 1.3145963675543451e-05,
698
- "loss": 2.1263,
699
- "step": 230
700
- },
701
- {
702
- "epoch": 1.26,
703
- "learning_rate": 1.3034269657086993e-05,
704
- "loss": 1.2691,
705
- "step": 232
706
- },
707
- {
708
- "epoch": 1.27,
709
- "learning_rate": 1.2922157132952106e-05,
710
- "loss": 1.2686,
711
- "step": 234
712
- },
713
- {
714
- "epoch": 1.28,
715
- "learning_rate": 1.2809641566407802e-05,
716
- "loss": 1.4141,
717
- "step": 236
718
- },
719
- {
720
- "epoch": 1.29,
721
- "learning_rate": 1.2696738476313261e-05,
722
- "loss": 1.4717,
723
- "step": 238
724
- },
725
- {
726
- "epoch": 1.3,
727
- "learning_rate": 1.258346343497736e-05,
728
- "loss": 1.5721,
729
- "step": 240
730
- },
731
- {
732
- "epoch": 1.31,
733
- "learning_rate": 1.2469832066010843e-05,
734
- "loss": 1.4225,
735
- "step": 242
736
- },
737
- {
738
- "epoch": 1.33,
739
- "learning_rate": 1.2355860042171421e-05,
740
- "loss": 1.4003,
741
- "step": 244
742
- },
743
- {
744
- "epoch": 1.34,
745
- "learning_rate": 1.224156308320208e-05,
746
- "loss": 1.499,
747
- "step": 246
748
- },
749
- {
750
- "epoch": 1.35,
751
- "learning_rate": 1.2126956953662914e-05,
752
- "loss": 1.6105,
753
- "step": 248
754
- },
755
- {
756
- "epoch": 1.36,
757
- "learning_rate": 1.2012057460756786e-05,
758
- "loss": 1.5542,
759
- "step": 250
760
- },
761
- {
762
- "epoch": 1.37,
763
- "learning_rate": 1.1896880452149077e-05,
764
- "loss": 1.7083,
765
- "step": 252
766
- },
767
- {
768
- "epoch": 1.38,
769
- "learning_rate": 1.1781441813781911e-05,
770
- "loss": 1.5674,
771
- "step": 254
772
- },
773
- {
774
- "epoch": 1.39,
775
- "learning_rate": 1.1665757467683025e-05,
776
- "loss": 1.5958,
777
- "step": 256
778
- },
779
- {
780
- "epoch": 1.4,
781
- "learning_rate": 1.1549843369769733e-05,
782
- "loss": 1.5586,
783
- "step": 258
784
- },
785
- {
786
- "epoch": 1.41,
787
- "learning_rate": 1.1433715507648173e-05,
788
- "loss": 1.692,
789
- "step": 260
790
- },
791
- {
792
- "epoch": 1.42,
793
- "learning_rate": 1.1317389898408188e-05,
794
- "loss": 1.695,
795
- "step": 262
796
- },
797
- {
798
- "epoch": 1.43,
799
- "learning_rate": 1.1200882586414168e-05,
800
- "loss": 1.8514,
801
- "step": 264
802
- },
803
- {
804
- "epoch": 1.44,
805
- "learning_rate": 1.1084209641092083e-05,
806
- "loss": 1.7508,
807
- "step": 266
808
- },
809
- {
810
- "epoch": 1.46,
811
- "learning_rate": 1.0967387154713104e-05,
812
- "loss": 1.8337,
813
- "step": 268
814
- },
815
- {
816
- "epoch": 1.47,
817
- "learning_rate": 1.0850431240174066e-05,
818
- "loss": 2.0335,
819
- "step": 270
820
- },
821
- {
822
- "epoch": 1.48,
823
- "learning_rate": 1.073335802877504e-05,
824
- "loss": 2.0606,
825
- "step": 272
826
- },
827
- {
828
- "epoch": 1.49,
829
- "learning_rate": 1.0616183667994435e-05,
830
- "loss": 1.8736,
831
- "step": 274
832
- },
833
- {
834
- "epoch": 1.5,
835
- "learning_rate": 1.0498924319261816e-05,
836
- "loss": 2.0607,
837
- "step": 276
838
- },
839
- {
840
- "epoch": 1.51,
841
- "learning_rate": 1.0381596155728823e-05,
842
- "loss": 1.2892,
843
- "step": 278
844
- },
845
- {
846
- "epoch": 1.52,
847
- "learning_rate": 1.0264215360038483e-05,
848
- "loss": 1.3868,
849
- "step": 280
850
- },
851
- {
852
- "epoch": 1.53,
853
- "learning_rate": 1.0146798122093167e-05,
854
- "loss": 1.3625,
855
- "step": 282
856
- },
857
- {
858
- "epoch": 1.54,
859
- "learning_rate": 1.00293606368216e-05,
860
- "loss": 1.3768,
861
- "step": 284
862
- },
863
- {
864
- "epoch": 1.55,
865
- "learning_rate": 9.91191910194515e-06,
866
- "loss": 1.511,
867
- "step": 286
868
- },
869
- {
870
- "epoch": 1.56,
871
- "learning_rate": 9.79448971574372e-06,
872
- "loss": 1.3749,
873
- "step": 288
874
- },
875
- {
876
- "epoch": 1.58,
877
- "learning_rate": 9.677088674821601e-06,
878
- "loss": 1.6872,
879
- "step": 290
880
- },
881
- {
882
- "epoch": 1.59,
883
- "learning_rate": 9.559732171873524e-06,
884
- "loss": 1.682,
885
- "step": 292
886
- },
887
- {
888
- "epoch": 1.6,
889
- "learning_rate": 9.442436393451252e-06,
890
- "loss": 1.4874,
891
- "step": 294
892
- },
893
- {
894
- "epoch": 1.61,
895
- "learning_rate": 9.325217517731047e-06,
896
- "loss": 1.3793,
897
- "step": 296
898
- },
899
- {
900
- "epoch": 1.62,
901
- "learning_rate": 9.208091712282261e-06,
902
- "loss": 1.6831,
903
- "step": 298
904
- },
905
- {
906
- "epoch": 1.63,
907
- "learning_rate": 9.091075131837399e-06,
908
- "loss": 1.4184,
909
- "step": 300
910
- },
911
- {
912
- "epoch": 1.64,
913
- "learning_rate": 8.974183916063967e-06,
914
- "loss": 1.5594,
915
- "step": 302
916
- },
917
- {
918
- "epoch": 1.65,
919
- "learning_rate": 8.857434187338381e-06,
920
- "loss": 1.5417,
921
- "step": 304
922
- },
923
- {
924
- "epoch": 1.66,
925
- "learning_rate": 8.740842048522268e-06,
926
- "loss": 1.666,
927
- "step": 306
928
- },
929
- {
930
- "epoch": 1.67,
931
- "learning_rate": 8.624423580741462e-06,
932
- "loss": 1.5612,
933
- "step": 308
934
- },
935
- {
936
- "epoch": 1.68,
937
- "learning_rate": 8.508194841167975e-06,
938
- "loss": 1.6074,
939
- "step": 310
940
- },
941
- {
942
- "epoch": 1.69,
943
- "learning_rate": 8.39217186080532e-06,
944
- "loss": 1.6851,
945
- "step": 312
946
- },
947
- {
948
- "epoch": 1.71,
949
- "learning_rate": 8.276370642277383e-06,
950
- "loss": 1.8596,
951
- "step": 314
952
- },
953
- {
954
- "epoch": 1.72,
955
- "learning_rate": 8.160807157621262e-06,
956
- "loss": 1.8975,
957
- "step": 316
958
- },
959
- {
960
- "epoch": 1.73,
961
- "learning_rate": 8.045497346084297e-06,
962
- "loss": 2.0025,
963
- "step": 318
964
- },
965
- {
966
- "epoch": 1.74,
967
- "learning_rate": 7.930457111925616e-06,
968
- "loss": 2.0551,
969
- "step": 320
970
- },
971
- {
972
- "epoch": 1.75,
973
- "learning_rate": 7.815702322222539e-06,
974
- "loss": 2.0083,
975
- "step": 322
976
- },
977
- {
978
- "epoch": 1.76,
979
- "learning_rate": 7.701248804682069e-06,
980
- "loss": 1.4952,
981
- "step": 324
982
- },
983
- {
984
- "epoch": 1.77,
985
- "learning_rate": 7.5871123454578534e-06,
986
- "loss": 1.4294,
987
- "step": 326
988
- },
989
- {
990
- "epoch": 1.78,
991
- "learning_rate": 7.47330868697285e-06,
992
- "loss": 1.483,
993
- "step": 328
994
- },
995
- {
996
- "epoch": 1.79,
997
- "learning_rate": 7.3598535257480244e-06,
998
- "loss": 1.3362,
999
- "step": 330
1000
- },
1001
- {
1002
- "epoch": 1.8,
1003
- "learning_rate": 7.246762510237404e-06,
1004
- "loss": 1.3898,
1005
- "step": 332
1006
- },
1007
- {
1008
- "epoch": 1.81,
1009
- "learning_rate": 7.134051238669722e-06,
1010
- "loss": 1.4447,
1011
- "step": 334
1012
- },
1013
- {
1014
- "epoch": 1.82,
1015
- "learning_rate": 7.021735256897035e-06,
1016
- "loss": 1.4999,
1017
- "step": 336
1018
- },
1019
- {
1020
- "epoch": 1.84,
1021
- "learning_rate": 6.909830056250527e-06,
1022
- "loss": 1.3675,
1023
- "step": 338
1024
- },
1025
- {
1026
- "epoch": 1.85,
1027
- "learning_rate": 6.798351071403839e-06,
1028
- "loss": 1.5689,
1029
- "step": 340
1030
- },
1031
- {
1032
- "epoch": 1.86,
1033
- "learning_rate": 6.687313678244243e-06,
1034
- "loss": 1.3948,
1035
- "step": 342
1036
- },
1037
- {
1038
- "epoch": 1.87,
1039
- "learning_rate": 6.576733191751879e-06,
1040
- "loss": 1.4351,
1041
- "step": 344
1042
- },
1043
- {
1044
- "epoch": 1.88,
1045
- "learning_rate": 6.466624863887437e-06,
1046
- "loss": 1.439,
1047
- "step": 346
1048
- },
1049
- {
1050
- "epoch": 1.89,
1051
- "learning_rate": 6.357003881488499e-06,
1052
- "loss": 1.6186,
1053
- "step": 348
1054
- },
1055
- {
1056
- "epoch": 1.9,
1057
- "learning_rate": 6.247885364174866e-06,
1058
- "loss": 1.5107,
1059
- "step": 350
1060
- },
1061
- {
1062
- "epoch": 1.91,
1063
- "learning_rate": 6.139284362263185e-06,
1064
- "loss": 1.5899,
1065
- "step": 352
1066
- },
1067
- {
1068
- "epoch": 1.92,
1069
- "learning_rate": 6.031215854691097e-06,
1070
- "loss": 1.8549,
1071
- "step": 354
1072
- },
1073
- {
1074
- "epoch": 1.93,
1075
- "learning_rate": 5.923694746951253e-06,
1076
- "loss": 1.8156,
1077
- "step": 356
1078
- },
1079
- {
1080
- "epoch": 1.94,
1081
- "learning_rate": 5.816735869035458e-06,
1082
- "loss": 1.7996,
1083
- "step": 358
1084
- },
1085
- {
1086
- "epoch": 1.96,
1087
- "learning_rate": 5.710353973389215e-06,
1088
- "loss": 1.8799,
1089
- "step": 360
1090
- },
1091
- {
1092
- "epoch": 1.97,
1093
- "learning_rate": 5.604563732876989e-06,
1094
- "loss": 1.951,
1095
- "step": 362
1096
- },
1097
- {
1098
- "epoch": 1.98,
1099
- "learning_rate": 5.4993797387584056e-06,
1100
- "loss": 2.0317,
1101
- "step": 364
1102
- },
1103
- {
1104
- "epoch": 1.99,
1105
- "learning_rate": 5.394816498675772e-06,
1106
- "loss": 2.1165,
1107
- "step": 366
1108
- },
1109
- {
1110
- "epoch": 2.0,
1111
- "learning_rate": 5.290888434653056e-06,
1112
- "loss": 2.1293,
1113
- "step": 368
1114
- },
1115
- {
1116
- "epoch": 2.01,
1117
- "learning_rate": 5.187609881106725e-06,
1118
- "loss": 1.3226,
1119
- "step": 370
1120
- },
1121
- {
1122
- "epoch": 2.02,
1123
- "learning_rate": 5.084995082868658e-06,
1124
- "loss": 1.3705,
1125
- "step": 372
1126
- },
1127
- {
1128
- "epoch": 2.03,
1129
- "learning_rate": 4.983058193221384e-06,
1130
- "loss": 1.3177,
1131
- "step": 374
1132
- },
1133
- {
1134
- "epoch": 2.04,
1135
- "learning_rate": 4.881813271946e-06,
1136
- "loss": 1.3059,
1137
- "step": 376
1138
- },
1139
- {
1140
- "epoch": 2.05,
1141
- "learning_rate": 4.781274283382941e-06,
1142
- "loss": 1.5649,
1143
- "step": 378
1144
- },
1145
- {
1146
- "epoch": 2.06,
1147
- "learning_rate": 4.681455094505938e-06,
1148
- "loss": 1.5704,
1149
- "step": 380
1150
- },
1151
- {
1152
- "epoch": 2.07,
1153
- "learning_rate": 4.58236947300939e-06,
1154
- "loss": 1.617,
1155
- "step": 382
1156
- },
1157
- {
1158
- "epoch": 2.09,
1159
- "learning_rate": 4.4840310854094335e-06,
1160
- "loss": 1.6093,
1161
- "step": 384
1162
- },
1163
- {
1164
- "epoch": 2.1,
1165
- "learning_rate": 4.386453495158983e-06,
1166
- "loss": 1.5376,
1167
- "step": 386
1168
- },
1169
- {
1170
- "epoch": 2.11,
1171
- "learning_rate": 4.289650160776967e-06,
1172
- "loss": 1.4214,
1173
- "step": 388
1174
- },
1175
- {
1176
- "epoch": 2.12,
1177
- "learning_rate": 4.19363443399204e-06,
1178
- "loss": 1.6884,
1179
- "step": 390
1180
- },
1181
- {
1182
- "epoch": 2.13,
1183
- "learning_rate": 4.098419557901036e-06,
1184
- "loss": 1.491,
1185
- "step": 392
1186
- },
1187
- {
1188
- "epoch": 2.14,
1189
- "learning_rate": 4.00401866514238e-06,
1190
- "loss": 1.55,
1191
- "step": 394
1192
- },
1193
- {
1194
- "epoch": 2.15,
1195
- "learning_rate": 3.910444776084777e-06,
1196
- "loss": 1.5974,
1197
- "step": 396
1198
- },
1199
- {
1200
- "epoch": 2.16,
1201
- "learning_rate": 3.817710797031338e-06,
1202
- "loss": 1.4703,
1203
- "step": 398
1204
- },
1205
- {
1206
- "epoch": 2.17,
1207
- "learning_rate": 3.7258295184394743e-06,
1208
- "loss": 1.8296,
1209
- "step": 400
1210
- },
1211
- {
1212
- "epoch": 2.18,
1213
- "learning_rate": 3.6348136131567537e-06,
1214
- "loss": 1.7331,
1215
- "step": 402
1216
- },
1217
- {
1218
- "epoch": 2.19,
1219
- "learning_rate": 3.5446756346729673e-06,
1220
- "loss": 1.8378,
1221
- "step": 404
1222
- },
1223
- {
1224
- "epoch": 2.21,
1225
- "learning_rate": 3.4554280153886967e-06,
1226
- "loss": 1.8124,
1227
- "step": 406
1228
- },
1229
- {
1230
- "epoch": 2.22,
1231
- "learning_rate": 3.3670830649005437e-06,
1232
- "loss": 1.8298,
1233
- "step": 408
1234
- },
1235
- {
1236
- "epoch": 2.23,
1237
- "learning_rate": 3.279652968303313e-06,
1238
- "loss": 2.1179,
1239
- "step": 410
1240
- },
1241
- {
1242
- "epoch": 2.24,
1243
- "learning_rate": 3.1931497845093753e-06,
1244
- "loss": 2.1219,
1245
- "step": 412
1246
- },
1247
- {
1248
- "epoch": 2.25,
1249
- "learning_rate": 3.1075854445854093e-06,
1250
- "loss": 1.8519,
1251
- "step": 414
1252
- },
1253
- {
1254
- "epoch": 2.26,
1255
- "learning_rate": 3.0229717501068133e-06,
1256
- "loss": 1.3978,
1257
- "step": 416
1258
- },
1259
- {
1260
- "epoch": 2.27,
1261
- "learning_rate": 2.9393203715299477e-06,
1262
- "loss": 1.4053,
1263
- "step": 418
1264
- },
1265
- {
1266
- "epoch": 2.28,
1267
- "learning_rate": 2.856642846582469e-06,
1268
- "loss": 1.354,
1269
- "step": 420
1270
- },
1271
- {
1272
- "epoch": 2.29,
1273
- "learning_rate": 2.77495057867198e-06,
1274
- "loss": 1.47,
1275
- "step": 422
1276
- },
1277
- {
1278
- "epoch": 2.3,
1279
- "learning_rate": 2.694254835313187e-06,
1280
- "loss": 1.5642,
1281
- "step": 424
1282
- },
1283
- {
1284
- "epoch": 2.31,
1285
- "learning_rate": 2.6145667465738333e-06,
1286
- "loss": 1.3536,
1287
- "step": 426
1288
- },
1289
- {
1290
- "epoch": 2.32,
1291
- "learning_rate": 2.535897303539554e-06,
1292
- "loss": 1.344,
1293
- "step": 428
1294
- },
1295
- {
1296
- "epoch": 2.34,
1297
- "learning_rate": 2.4582573567979196e-06,
1298
- "loss": 1.4085,
1299
- "step": 430
1300
- },
1301
- {
1302
- "epoch": 2.35,
1303
- "learning_rate": 2.381657614941858e-06,
1304
- "loss": 1.4864,
1305
- "step": 432
1306
- },
1307
- {
1308
- "epoch": 2.36,
1309
- "learning_rate": 2.306108643092647e-06,
1310
- "loss": 1.4518,
1311
- "step": 434
1312
- },
1313
- {
1314
- "epoch": 2.37,
1315
- "learning_rate": 2.2316208614427226e-06,
1316
- "loss": 1.5128,
1317
- "step": 436
1318
- },
1319
- {
1320
- "epoch": 2.38,
1321
- "learning_rate": 2.1582045438184464e-06,
1322
- "loss": 1.3548,
1323
- "step": 438
1324
- },
1325
- {
1326
- "epoch": 2.39,
1327
- "learning_rate": 2.085869816263081e-06,
1328
- "loss": 1.573,
1329
- "step": 440
1330
- },
1331
- {
1332
- "epoch": 2.4,
1333
- "learning_rate": 2.0146266556401405e-06,
1334
- "loss": 1.4981,
1335
- "step": 442
1336
- },
1337
- {
1338
- "epoch": 2.41,
1339
- "learning_rate": 1.944484888257312e-06,
1340
- "loss": 1.5371,
1341
- "step": 444
1342
- },
1343
- {
1344
- "epoch": 2.42,
1345
- "learning_rate": 1.8754541885111631e-06,
1346
- "loss": 1.799,
1347
- "step": 446
1348
- },
1349
- {
1350
- "epoch": 2.43,
1351
- "learning_rate": 1.8075440775527754e-06,
1352
- "loss": 1.7182,
1353
- "step": 448
1354
- },
1355
- {
1356
- "epoch": 2.44,
1357
- "learning_rate": 1.740763921974531e-06,
1358
- "loss": 1.6924,
1359
- "step": 450
1360
- },
1361
- {
1362
- "epoch": 2.45,
1363
- "learning_rate": 1.6751229325182194e-06,
1364
- "loss": 2.0618,
1365
- "step": 452
1366
- },
1367
- {
1368
- "epoch": 2.47,
1369
- "learning_rate": 1.610630162804615e-06,
1370
- "loss": 1.9485,
1371
- "step": 454
1372
- },
1373
- {
1374
- "epoch": 2.48,
1375
- "learning_rate": 1.5472945080847679e-06,
1376
- "loss": 2.1394,
1377
- "step": 456
1378
- },
1379
- {
1380
- "epoch": 2.49,
1381
- "learning_rate": 1.485124704013101e-06,
1382
- "loss": 2.009,
1383
- "step": 458
1384
- },
1385
- {
1386
- "epoch": 2.5,
1387
- "learning_rate": 1.4241293254425337e-06,
1388
- "loss": 2.0365,
1389
- "step": 460
1390
- },
1391
- {
1392
- "epoch": 2.51,
1393
- "learning_rate": 1.3643167852417894e-06,
1394
- "loss": 1.368,
1395
- "step": 462
1396
- },
1397
- {
1398
- "epoch": 2.52,
1399
- "learning_rate": 1.3056953331350297e-06,
1400
- "loss": 1.3758,
1401
- "step": 464
1402
- },
1403
- {
1404
- "epoch": 2.53,
1405
- "learning_rate": 1.2482730545640133e-06,
1406
- "loss": 1.334,
1407
- "step": 466
1408
- },
1409
- {
1410
- "epoch": 2.54,
1411
- "learning_rate": 1.1920578695728903e-06,
1412
- "loss": 1.3182,
1413
- "step": 468
1414
- },
1415
- {
1416
- "epoch": 2.55,
1417
- "learning_rate": 1.137057531715825e-06,
1418
- "loss": 1.4656,
1419
- "step": 470
1420
- },
1421
- {
1422
- "epoch": 2.56,
1423
- "learning_rate": 1.0832796269875757e-06,
1424
- "loss": 1.5246,
1425
- "step": 472
1426
- },
1427
- {
1428
- "epoch": 2.57,
1429
- "learning_rate": 1.0307315727771806e-06,
1430
- "loss": 1.4259,
1431
- "step": 474
1432
- },
1433
- {
1434
- "epoch": 2.59,
1435
- "learning_rate": 9.794206168449127e-07,
1436
- "loss": 1.4863,
1437
- "step": 476
1438
- },
1439
- {
1440
- "epoch": 2.6,
1441
- "learning_rate": 9.293538363226196e-07,
1442
- "loss": 1.5547,
1443
- "step": 478
1444
- },
1445
- {
1446
- "epoch": 2.61,
1447
- "learning_rate": 8.80538136737602e-07,
1448
- "loss": 1.5829,
1449
- "step": 480
1450
- },
1451
- {
1452
- "epoch": 2.62,
1453
- "learning_rate": 8.329802510601559e-07,
1454
- "loss": 1.4331,
1455
- "step": 482
1456
- },
1457
- {
1458
- "epoch": 2.63,
1459
- "learning_rate": 7.866867387749199e-07,
1460
- "loss": 1.5252,
1461
- "step": 484
1462
- },
1463
- {
1464
- "epoch": 2.64,
1465
- "learning_rate": 7.416639849761531e-07,
1466
- "loss": 1.5303,
1467
- "step": 486
1468
- },
1469
- {
1470
- "epoch": 2.65,
1471
- "learning_rate": 6.979181994870587e-07,
1472
- "loss": 1.6454,
1473
- "step": 488
1474
- },
1475
- {
1476
- "epoch": 2.66,
1477
- "learning_rate": 6.554554160032899e-07,
1478
- "loss": 1.7071,
1479
- "step": 490
1480
- },
1481
- {
1482
- "epoch": 2.67,
1483
- "learning_rate": 6.142814912607409e-07,
1484
- "loss": 1.6386,
1485
- "step": 492
1486
- },
1487
- {
1488
- "epoch": 2.68,
1489
- "learning_rate": 5.744021042277437e-07,
1490
- "loss": 1.7614,
1491
- "step": 494
1492
- },
1493
- {
1494
- "epoch": 2.69,
1495
- "learning_rate": 5.358227553218031e-07,
1496
- "loss": 1.9176,
1497
- "step": 496
1498
- },
1499
- {
1500
- "epoch": 2.7,
1501
- "learning_rate": 4.985487656509313e-07,
1502
- "loss": 1.8453,
1503
- "step": 498
1504
- },
1505
- {
1506
- "epoch": 2.72,
1507
- "learning_rate": 4.6258527627973446e-07,
1508
- "loss": 1.8006,
1509
- "step": 500
1510
- },
1511
- {
1512
- "epoch": 2.73,
1513
- "learning_rate": 4.2793724752031807e-07,
1514
- "loss": 2.0535,
1515
- "step": 502
1516
- },
1517
- {
1518
- "epoch": 2.74,
1519
- "learning_rate": 3.9460945824813635e-07,
1520
- "loss": 1.9608,
1521
- "step": 504
1522
- },
1523
- {
1524
- "epoch": 2.75,
1525
- "learning_rate": 3.626065052428551e-07,
1526
- "loss": 2.0253,
1527
- "step": 506
1528
- },
1529
- {
1530
- "epoch": 2.76,
1531
- "learning_rate": 3.3193280255433556e-07,
1532
- "loss": 1.3613,
1533
- "step": 508
1534
- },
1535
- {
1536
- "epoch": 2.77,
1537
- "learning_rate": 3.0259258089382236e-07,
1538
- "loss": 1.3568,
1539
- "step": 510
1540
- },
1541
- {
1542
- "epoch": 2.78,
1543
- "learning_rate": 2.745898870504116e-07,
1544
- "loss": 1.4718,
1545
- "step": 512
1546
- },
1547
- {
1548
- "epoch": 2.79,
1549
- "learning_rate": 2.479285833329015e-07,
1550
- "loss": 1.4528,
1551
- "step": 514
1552
- },
1553
- {
1554
- "epoch": 2.8,
1555
- "learning_rate": 2.226123470370689e-07,
1556
- "loss": 1.3257,
1557
- "step": 516
1558
- },
1559
- {
1560
- "epoch": 2.81,
1561
- "learning_rate": 1.9864466993847808e-07,
1562
- "loss": 1.436,
1563
- "step": 518
1564
- },
1565
- {
1566
- "epoch": 2.82,
1567
- "learning_rate": 1.7602885781087486e-07,
1568
- "loss": 1.5998,
1569
- "step": 520
1570
- },
1571
- {
1572
- "epoch": 2.84,
1573
- "learning_rate": 1.5476802997022812e-07,
1574
- "loss": 1.4146,
1575
- "step": 522
1576
- },
1577
- {
1578
- "epoch": 2.85,
1579
- "learning_rate": 1.3486511884449827e-07,
1580
- "loss": 1.4337,
1581
- "step": 524
1582
- },
1583
- {
1584
- "epoch": 2.86,
1585
- "learning_rate": 1.1632286956917427e-07,
1586
- "loss": 1.4613,
1587
- "step": 526
1588
- },
1589
- {
1590
- "epoch": 2.87,
1591
- "learning_rate": 9.914383960865081e-08,
1592
- "loss": 1.4559,
1593
- "step": 528
1594
- },
1595
- {
1596
- "epoch": 2.88,
1597
- "learning_rate": 8.333039840348833e-08,
1598
- "loss": 1.5955,
1599
- "step": 530
1600
- },
1601
- {
1602
- "epoch": 2.89,
1603
- "learning_rate": 6.888472704359661e-08,
1604
- "loss": 1.5219,
1605
- "step": 532
1606
- },
1607
- {
1608
- "epoch": 2.9,
1609
- "learning_rate": 5.580881796741322e-08,
1610
- "loss": 1.5566,
1611
- "step": 534
1612
- },
1613
- {
1614
- "epoch": 2.91,
1615
- "learning_rate": 4.410447468709001e-08,
1616
- "loss": 1.5424,
1617
- "step": 536
1618
- },
1619
- {
1620
- "epoch": 2.92,
1621
- "learning_rate": 3.377331153974206e-08,
1622
- "loss": 1.5721,
1623
- "step": 538
1624
- },
1625
- {
1626
- "epoch": 2.93,
1627
- "learning_rate": 2.4816753464789177e-08,
1628
- "loss": 1.5114,
1629
- "step": 540
1630
- },
1631
- {
1632
- "epoch": 2.94,
1633
- "learning_rate": 1.7236035807416397e-08,
1634
- "loss": 1.8108,
1635
- "step": 542
1636
- },
1637
- {
1638
- "epoch": 2.95,
1639
- "learning_rate": 1.1032204148191395e-08,
1640
- "loss": 1.7596,
1641
- "step": 544
1642
- },
1643
- {
1644
- "epoch": 2.97,
1645
- "learning_rate": 6.206114158845422e-09,
1646
- "loss": 2.0684,
1647
- "step": 546
1648
- },
1649
- {
1650
- "epoch": 2.98,
1651
- "learning_rate": 2.758431484259916e-09,
1652
- "loss": 1.7532,
1653
- "step": 548
1654
- },
1655
- {
1656
- "epoch": 2.99,
1657
- "learning_rate": 6.896316506554979e-10,
1658
- "loss": 2.1096,
1659
- "step": 550
1660
- },
1661
- {
1662
- "epoch": 3.0,
1663
- "learning_rate": 0.0,
1664
- "loss": 2.1121,
1665
- "step": 552
1666
  }
1667
  ],
1668
  "logging_steps": 2,
1669
- "max_steps": 552,
1670
  "num_input_tokens_seen": 0,
1671
  "num_train_epochs": 3,
1672
  "save_steps": 500,
1673
- "total_flos": 3.105937817991168e+16,
1674
  "train_batch_size": 1,
1675
  "trial_name": null,
1676
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.9972041006523766,
5
  "eval_steps": 500,
6
+ "global_step": 201,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.03,
13
+ "learning_rate": 5.7142857142857145e-06,
14
+ "loss": 2.5851,
15
  "step": 2
16
  },
17
  {
18
+ "epoch": 0.06,
19
+ "learning_rate": 1.1428571428571429e-05,
20
+ "loss": 1.606,
21
  "step": 4
22
  },
23
  {
24
+ "epoch": 0.09,
25
+ "learning_rate": 1.7142857142857142e-05,
26
+ "loss": 1.6376,
27
  "step": 6
28
  },
29
  {
30
+ "epoch": 0.12,
31
+ "learning_rate": 1.9998688836656322e-05,
32
+ "loss": 1.5397,
33
  "step": 8
34
  },
35
  {
36
+ "epoch": 0.15,
37
+ "learning_rate": 1.998820159279591e-05,
38
+ "loss": 1.2479,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.18,
43
+ "learning_rate": 1.9967238104745695e-05,
44
+ "loss": 1.5462,
45
  "step": 12
46
  },
47
  {
48
+ "epoch": 0.21,
49
+ "learning_rate": 1.993582036030978e-05,
50
+ "loss": 1.9254,
51
  "step": 14
52
  },
53
  {
54
+ "epoch": 0.24,
55
+ "learning_rate": 1.9893981312363563e-05,
56
+ "loss": 2.377,
57
  "step": 16
58
  },
59
  {
60
+ "epoch": 0.27,
61
+ "learning_rate": 1.9841764844290744e-05,
62
+ "loss": 2.0347,
63
  "step": 18
64
  },
65
  {
66
+ "epoch": 0.3,
67
+ "learning_rate": 1.977922572395571e-05,
68
+ "loss": 1.5208,
69
  "step": 20
70
  },
71
  {
72
+ "epoch": 0.33,
73
+ "learning_rate": 1.9706429546259592e-05,
74
+ "loss": 1.4954,
75
  "step": 22
76
  },
77
  {
78
+ "epoch": 0.36,
79
+ "learning_rate": 1.9623452664340305e-05,
80
+ "loss": 1.4711,
81
  "step": 24
82
  },
83
  {
84
+ "epoch": 0.39,
85
+ "learning_rate": 1.953038210948861e-05,
86
+ "loss": 1.4482,
87
  "step": 26
88
  },
89
  {
90
+ "epoch": 0.42,
91
+ "learning_rate": 1.9427315499864345e-05,
92
+ "loss": 1.6474,
93
  "step": 28
94
  },
95
  {
96
+ "epoch": 0.45,
97
+ "learning_rate": 1.9314360938108427e-05,
98
+ "loss": 1.9729,
99
  "step": 30
100
  },
101
  {
102
+ "epoch": 0.48,
103
+ "learning_rate": 1.9191636897958123e-05,
104
+ "loss": 2.0591,
105
  "step": 32
106
  },
107
  {
108
+ "epoch": 0.51,
109
+ "learning_rate": 1.905927209998447e-05,
110
+ "loss": 2.4572,
111
  "step": 34
112
  },
113
  {
114
+ "epoch": 0.54,
115
+ "learning_rate": 1.8917405376582144e-05,
116
+ "loss": 1.8854,
117
  "step": 36
118
  },
119
  {
120
+ "epoch": 0.57,
121
+ "learning_rate": 1.876618552635348e-05,
122
+ "loss": 1.5307,
123
  "step": 38
124
  },
125
  {
126
+ "epoch": 0.6,
127
+ "learning_rate": 1.8605771158039253e-05,
128
+ "loss": 1.4091,
129
  "step": 40
130
  },
131
  {
132
+ "epoch": 0.63,
133
+ "learning_rate": 1.8436330524160048e-05,
134
+ "loss": 1.2489,
135
  "step": 42
136
  },
137
  {
138
+ "epoch": 0.66,
139
+ "learning_rate": 1.8258041344542567e-05,
140
+ "loss": 1.4895,
141
  "step": 44
142
  },
143
  {
144
+ "epoch": 0.69,
145
+ "learning_rate": 1.8071090619916095e-05,
146
+ "loss": 1.7542,
147
  "step": 46
148
  },
149
  {
150
+ "epoch": 0.72,
151
+ "learning_rate": 1.7875674435774546e-05,
152
+ "loss": 1.6872,
153
  "step": 48
154
  },
155
  {
156
+ "epoch": 0.75,
157
+ "learning_rate": 1.767199775670986e-05,
158
+ "loss": 2.7505,
159
  "step": 50
160
  },
161
  {
162
+ "epoch": 0.78,
163
+ "learning_rate": 1.7460274211432463e-05,
164
+ "loss": 1.5695,
165
  "step": 52
166
  },
167
  {
168
+ "epoch": 0.81,
169
+ "learning_rate": 1.7240725868704218e-05,
170
+ "loss": 1.3149,
171
  "step": 54
172
  },
173
  {
174
+ "epoch": 0.84,
175
+ "learning_rate": 1.7013583004418994e-05,
176
+ "loss": 1.5899,
177
  "step": 56
178
  },
179
  {
180
+ "epoch": 0.86,
181
+ "learning_rate": 1.6779083860075032e-05,
182
+ "loss": 1.39,
183
  "step": 58
184
  },
185
  {
186
+ "epoch": 0.89,
187
+ "learning_rate": 1.6537474392892527e-05,
188
+ "loss": 1.2605,
189
  "step": 60
190
  },
191
  {
192
+ "epoch": 0.92,
193
+ "learning_rate": 1.6289008017838447e-05,
194
+ "loss": 1.4089,
195
  "step": 62
196
  },
197
  {
198
+ "epoch": 0.95,
199
+ "learning_rate": 1.603394534182925e-05,
200
+ "loss": 1.7018,
201
  "step": 64
202
  },
203
  {
204
+ "epoch": 0.98,
205
+ "learning_rate": 1.5772553890390196e-05,
206
+ "loss": 1.3784,
207
  "step": 66
208
  },
209
  {
210
+ "epoch": 1.01,
211
+ "learning_rate": 1.5505107827058038e-05,
212
+ "loss": 2.1283,
213
  "step": 68
214
  },
215
  {
216
+ "epoch": 1.04,
217
+ "learning_rate": 1.52318876658213e-05,
218
+ "loss": 1.7516,
219
  "step": 70
220
  },
221
  {
222
+ "epoch": 1.07,
223
+ "learning_rate": 1.4953179976899878e-05,
224
+ "loss": 1.3695,
225
  "step": 72
226
  },
227
  {
228
+ "epoch": 1.1,
229
+ "learning_rate": 1.4669277086172406e-05,
230
+ "loss": 1.5351,
231
  "step": 74
232
  },
233
  {
234
+ "epoch": 1.13,
235
+ "learning_rate": 1.4380476768566825e-05,
236
+ "loss": 1.212,
237
  "step": 76
238
  },
239
  {
240
+ "epoch": 1.16,
241
+ "learning_rate": 1.4087081935735565e-05,
242
+ "loss": 1.2777,
243
  "step": 78
244
  },
245
  {
246
+ "epoch": 1.19,
247
+ "learning_rate": 1.378940031834307e-05,
248
+ "loss": 1.4289,
249
  "step": 80
250
  },
251
  {
252
+ "epoch": 1.22,
253
+ "learning_rate": 1.3487744143298822e-05,
254
+ "loss": 1.6633,
255
  "step": 82
256
  },
257
  {
258
+ "epoch": 1.25,
259
+ "learning_rate": 1.3182429806274442e-05,
260
+ "loss": 1.9543,
261
  "step": 84
262
  },
263
  {
264
+ "epoch": 1.28,
265
+ "learning_rate": 1.2873777539848284e-05,
266
+ "loss": 1.7349,
267
  "step": 86
268
  },
269
  {
270
+ "epoch": 1.31,
271
+ "learning_rate": 1.2562111077625723e-05,
272
+ "loss": 1.5843,
273
  "step": 88
274
  },
275
  {
276
+ "epoch": 1.34,
277
+ "learning_rate": 1.2247757314687296e-05,
278
+ "loss": 1.469,
279
  "step": 90
280
  },
281
  {
282
+ "epoch": 1.37,
283
+ "learning_rate": 1.1931045964720882e-05,
284
+ "loss": 1.3608,
285
  "step": 92
286
  },
287
  {
288
+ "epoch": 1.4,
289
+ "learning_rate": 1.1612309214197599e-05,
290
+ "loss": 1.1707,
291
  "step": 94
292
  },
293
  {
294
+ "epoch": 1.43,
295
+ "learning_rate": 1.1291881373954066e-05,
296
+ "loss": 1.4295,
297
  "step": 96
298
  },
299
  {
300
+ "epoch": 1.46,
301
+ "learning_rate": 1.0970098528546482e-05,
302
+ "loss": 1.5818,
303
  "step": 98
304
  },
305
  {
306
+ "epoch": 1.49,
307
+ "learning_rate": 1.0647298183744359e-05,
308
+ "loss": 1.6487,
309
  "step": 100
310
  },
311
  {
312
+ "epoch": 1.52,
313
+ "learning_rate": 1.0323818912533561e-05,
314
+ "loss": 2.4078,
315
  "step": 102
316
  },
317
  {
318
+ "epoch": 1.55,
319
+ "learning_rate": 1e-05,
320
+ "loss": 1.4366,
321
  "step": 104
322
  },
323
  {
324
+ "epoch": 1.58,
325
+ "learning_rate": 9.676181087466444e-06,
326
+ "loss": 1.3339,
327
  "step": 106
328
  },
329
  {
330
+ "epoch": 1.61,
331
+ "learning_rate": 9.352701816255643e-06,
332
+ "loss": 1.2585,
333
  "step": 108
334
  },
335
  {
336
+ "epoch": 1.64,
337
+ "learning_rate": 9.02990147145352e-06,
338
+ "loss": 1.1135,
339
  "step": 110
340
  },
341
  {
342
+ "epoch": 1.67,
343
+ "learning_rate": 8.708118626045939e-06,
344
+ "loss": 1.4447,
345
  "step": 112
346
  },
347
  {
348
+ "epoch": 1.7,
349
+ "learning_rate": 8.387690785802403e-06,
350
+ "loss": 1.747,
351
  "step": 114
352
  },
353
  {
354
+ "epoch": 1.73,
355
+ "learning_rate": 8.068954035279121e-06,
356
+ "loss": 2.0869,
357
  "step": 116
358
  },
359
  {
360
+ "epoch": 1.76,
361
+ "learning_rate": 7.752242685312709e-06,
362
+ "loss": 2.0168,
363
  "step": 118
364
  },
365
  {
366
+ "epoch": 1.79,
367
+ "learning_rate": 7.4378889223742766e-06,
368
+ "loss": 1.5621,
369
  "step": 120
370
  },
371
  {
372
+ "epoch": 1.82,
373
+ "learning_rate": 7.126222460151719e-06,
374
+ "loss": 1.3901,
375
  "step": 122
376
  },
377
  {
378
+ "epoch": 1.85,
379
+ "learning_rate": 6.8175701937255645e-06,
380
+ "loss": 1.2683,
381
  "step": 124
382
  },
383
  {
384
+ "epoch": 1.88,
385
+ "learning_rate": 6.5122558567011775e-06,
386
+ "loss": 1.1724,
387
  "step": 126
388
  },
389
  {
390
+ "epoch": 1.91,
391
+ "learning_rate": 6.210599681656933e-06,
392
+ "loss": 1.5036,
393
  "step": 128
394
  },
395
  {
396
+ "epoch": 1.94,
397
+ "learning_rate": 5.912918064264441e-06,
398
+ "loss": 1.7858,
399
  "step": 130
400
  },
401
  {
402
+ "epoch": 1.97,
403
+ "learning_rate": 5.619523231433177e-06,
404
+ "loss": 1.2173,
405
  "step": 132
406
  },
407
  {
408
+ "epoch": 2.0,
409
+ "learning_rate": 5.330722913827594e-06,
410
+ "loss": 1.4985,
411
  "step": 134
412
  },
413
  {
414
+ "epoch": 2.03,
415
+ "learning_rate": 5.046820023100129e-06,
416
+ "loss": 2.1682,
417
  "step": 136
418
  },
419
  {
420
+ "epoch": 2.06,
421
+ "learning_rate": 4.7681123341787e-06,
422
+ "loss": 1.5148,
423
  "step": 138
424
  },
425
  {
426
+ "epoch": 2.09,
427
+ "learning_rate": 4.494892172941965e-06,
428
+ "loss": 1.3437,
429
  "step": 140
430
  },
431
  {
432
+ "epoch": 2.12,
433
+ "learning_rate": 4.2274461096098085e-06,
434
+ "loss": 1.4119,
435
  "step": 142
436
  },
437
  {
438
+ "epoch": 2.15,
439
+ "learning_rate": 3.966054658170754e-06,
440
+ "loss": 1.1752,
441
  "step": 144
442
  },
443
  {
444
+ "epoch": 2.18,
445
+ "learning_rate": 3.7109919821615546e-06,
446
+ "loss": 1.4334,
447
  "step": 146
448
  },
449
  {
450
+ "epoch": 2.21,
451
+ "learning_rate": 3.4625256071074776e-06,
452
+ "loss": 1.4999,
453
  "step": 148
454
  },
455
  {
456
+ "epoch": 2.24,
457
+ "learning_rate": 3.2209161399249677e-06,
458
+ "loss": 1.4172,
459
  "step": 150
460
  },
461
  {
462
+ "epoch": 2.27,
463
+ "learning_rate": 2.9864169955810085e-06,
464
+ "loss": 2.4804,
465
  "step": 152
466
  },
467
  {
468
+ "epoch": 2.3,
469
+ "learning_rate": 2.759274131295787e-06,
470
+ "loss": 1.9752,
471
  "step": 154
472
  },
473
  {
474
+ "epoch": 2.33,
475
+ "learning_rate": 2.5397257885675396e-06,
476
+ "loss": 1.4638,
477
  "step": 156
478
  },
479
  {
480
+ "epoch": 2.36,
481
+ "learning_rate": 2.328002243290138e-06,
482
+ "loss": 1.4279,
483
  "step": 158
484
  },
485
  {
486
+ "epoch": 2.39,
487
+ "learning_rate": 2.124325564225458e-06,
488
+ "loss": 1.1614,
489
  "step": 160
490
  },
491
  {
492
+ "epoch": 2.42,
493
+ "learning_rate": 1.9289093800839067e-06,
494
+ "loss": 1.3463,
495
  "step": 162
496
  },
497
  {
498
+ "epoch": 2.45,
499
+ "learning_rate": 1.7419586554574364e-06,
500
+ "loss": 1.4851,
501
  "step": 164
502
  },
503
  {
504
+ "epoch": 2.48,
505
+ "learning_rate": 1.5636694758399563e-06,
506
+ "loss": 1.5392,
507
  "step": 166
508
  },
509
  {
510
+ "epoch": 2.51,
511
+ "learning_rate": 1.3942288419607476e-06,
512
+ "loss": 2.1037,
513
  "step": 168
514
  },
515
  {
516
+ "epoch": 2.53,
517
+ "learning_rate": 1.233814473646524e-06,
518
+ "loss": 1.573,
519
  "step": 170
520
  },
521
  {
522
+ "epoch": 2.56,
523
+ "learning_rate": 1.0825946234178575e-06,
524
+ "loss": 1.4296,
525
  "step": 172
526
  },
527
  {
528
+ "epoch": 2.59,
529
+ "learning_rate": 9.407279000155311e-07,
530
+ "loss": 1.5531,
531
  "step": 174
532
  },
533
  {
534
+ "epoch": 2.62,
535
+ "learning_rate": 8.083631020418792e-07,
536
+ "loss": 1.1438,
537
  "step": 176
538
  },
539
  {
540
+ "epoch": 2.65,
541
+ "learning_rate": 6.856390618915775e-07,
542
+ "loss": 1.1934,
543
  "step": 178
544
  },
545
  {
546
+ "epoch": 2.68,
547
+ "learning_rate": 5.726845001356573e-07,
548
+ "loss": 1.5436,
549
  "step": 180
550
  },
551
  {
552
+ "epoch": 2.71,
553
+ "learning_rate": 4.696178905113913e-07,
554
+ "loss": 1.5864,
555
  "step": 182
556
  },
557
  {
558
+ "epoch": 2.74,
559
+ "learning_rate": 3.7654733565969826e-07,
560
+ "loss": 2.6632,
561
  "step": 184
562
  },
563
  {
564
+ "epoch": 2.77,
565
+ "learning_rate": 2.935704537404083e-07,
566
+ "loss": 1.4566,
567
  "step": 186
568
  },
569
  {
570
+ "epoch": 2.8,
571
+ "learning_rate": 2.2077427604429435e-07,
572
+ "loss": 1.3796,
573
  "step": 188
574
  },
575
  {
576
+ "epoch": 2.83,
577
+ "learning_rate": 1.5823515570925763e-07,
578
+ "loss": 1.385,
579
  "step": 190
580
  },
581
  {
582
+ "epoch": 2.86,
583
+ "learning_rate": 1.0601868763643997e-07,
584
+ "loss": 0.9717,
585
  "step": 192
586
  },
587
  {
588
+ "epoch": 2.89,
589
+ "learning_rate": 6.417963969022389e-08,
590
+ "loss": 1.4063,
591
  "step": 194
592
  },
593
  {
594
+ "epoch": 2.92,
595
+ "learning_rate": 3.2761895254306285e-08,
596
+ "loss": 1.6286,
597
  "step": 196
598
  },
599
  {
600
+ "epoch": 2.95,
601
+ "learning_rate": 1.179840720409331e-08,
602
+ "loss": 1.5127,
603
  "step": 198
604
  },
605
  {
606
+ "epoch": 2.98,
607
+ "learning_rate": 1.3111633436779792e-09,
608
+ "loss": 1.1829,
609
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
610
  }
611
  ],
612
  "logging_steps": 2,
613
+ "max_steps": 201,
614
  "num_input_tokens_seen": 0,
615
  "num_train_epochs": 3,
616
  "save_steps": 500,
617
+ "total_flos": 1.4168234860720128e+16,
618
  "train_batch_size": 1,
619
  "trial_name": null,
620
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1d51f270a16671d80488d349ef714abc13dac964658c0a8f0c344a34b1bc033
3
  size 4728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f64cf07e44bd49988614f1c5896457855e086b870fd8cb30fb62dd90dfe3010
3
  size 4728