ccore commited on
Commit
6f7fa2a
·
verified ·
1 Parent(s): 539841b

Training in progress, epoch 1, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f246aa6cc0081fa9b7e4332214ed8e193ec105b00011ef4ed4a4e7205284468e
3
  size 1324830880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40b4a1e4ec540bc656d73ff8878ee7be835cf08cb3173699ec17509946916f58
3
  size 1324830880
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:704585827dd882d975d15c26c266d1a1f18f0a6f6f8caae429f9df95c732e945
3
  size 2649896094
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8ad656b279f1fb52687fc536d24a1765d0c84180daa22965672e107fd6870e2
3
  size 2649896094
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04b2b615f7283a358a58daa1145605faca5fb68ef23664939df3a945f155385b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4148cdc128d12b940ce70f5dee553010f8414faa77e9b07518cb0e0d0dbbdc85
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36fb99b37addcc915d885e7795ee1e9b31ef9ba8894219ccbdfd5e3c1cf04f2d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:032817f2c288f82ea29ad5b7447cbdb475d0523f1714c8c4435e81aa12e3c82b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,1226 +1,327 @@
1
  {
2
- "best_metric": 0.3090362250804901,
3
- "best_model_checkpoint": "./opt_trained/checkpoint-168",
4
- "epoch": 3.912280701754386,
5
  "eval_steps": 500,
6
- "global_step": 168,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.023391812865497075,
13
- "grad_norm": 3.8931937217712402,
14
- "learning_rate": 9.940476190476191e-05,
15
- "loss": 2.5384,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.04678362573099415,
20
- "grad_norm": 39.378150939941406,
21
- "learning_rate": 9.880952380952381e-05,
22
- "loss": 3.5216,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.07017543859649122,
27
- "grad_norm": 14.441329002380371,
28
- "learning_rate": 9.821428571428572e-05,
29
- "loss": 3.0676,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.0935672514619883,
34
- "grad_norm": 11.781136512756348,
35
- "learning_rate": 9.761904761904762e-05,
36
- "loss": 2.8807,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.11695906432748537,
41
- "grad_norm": 7.509276866912842,
42
- "learning_rate": 9.702380952380953e-05,
43
- "loss": 2.793,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.14035087719298245,
48
- "grad_norm": 7.06434440612793,
49
- "learning_rate": 9.642857142857143e-05,
50
- "loss": 2.7181,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.16374269005847952,
55
- "grad_norm": 5.831886291503906,
56
- "learning_rate": 9.583333333333334e-05,
57
- "loss": 2.6268,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.1871345029239766,
62
- "grad_norm": Infinity,
63
- "learning_rate": 9.583333333333334e-05,
64
- "loss": 3.0215,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.21052631578947367,
69
- "grad_norm": 80.48746490478516,
70
- "learning_rate": 9.523809523809524e-05,
71
- "loss": 3.0411,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.23391812865497075,
76
- "grad_norm": 13.120291709899902,
77
- "learning_rate": 9.464285714285715e-05,
78
- "loss": 2.7814,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.2573099415204678,
83
- "grad_norm": 5.15352201461792,
84
- "learning_rate": 9.404761904761905e-05,
85
- "loss": 2.7369,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.2807017543859649,
90
- "grad_norm": 11.227391242980957,
91
- "learning_rate": 9.345238095238095e-05,
92
- "loss": 2.7984,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.30409356725146197,
97
- "grad_norm": 12.440800666809082,
98
- "learning_rate": 9.285714285714286e-05,
99
- "loss": 2.7365,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.32748538011695905,
104
- "grad_norm": 9.62018871307373,
105
- "learning_rate": 9.226190476190478e-05,
106
- "loss": 2.6912,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.3508771929824561,
111
- "grad_norm": 5.563836097717285,
112
- "learning_rate": 9.166666666666667e-05,
113
- "loss": 2.6338,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.3742690058479532,
118
- "grad_norm": 3.563298463821411,
119
- "learning_rate": 9.107142857142857e-05,
120
- "loss": 2.6897,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.39766081871345027,
125
- "grad_norm": 2.9909682273864746,
126
- "learning_rate": 9.047619047619048e-05,
127
- "loss": 2.6602,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.42105263157894735,
132
- "grad_norm": 2.2207834720611572,
133
- "learning_rate": 8.988095238095238e-05,
134
- "loss": 2.6597,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.4444444444444444,
139
- "grad_norm": 1.5034862756729126,
140
- "learning_rate": 8.92857142857143e-05,
141
- "loss": 2.613,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.4678362573099415,
146
- "grad_norm": 1.7152122259140015,
147
- "learning_rate": 8.869047619047619e-05,
148
- "loss": 2.6001,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.49122807017543857,
153
- "grad_norm": 3.2452425956726074,
154
- "learning_rate": 8.80952380952381e-05,
155
- "loss": 2.5923,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.5146198830409356,
160
- "grad_norm": 2.9392387866973877,
161
- "learning_rate": 8.75e-05,
162
- "loss": 2.6475,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.5380116959064327,
167
- "grad_norm": 2.2862868309020996,
168
- "learning_rate": 8.690476190476192e-05,
169
- "loss": 2.6228,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.5614035087719298,
174
- "grad_norm": 3.852323293685913,
175
- "learning_rate": 8.630952380952382e-05,
176
- "loss": 2.6226,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.5847953216374269,
181
- "grad_norm": 4.545846462249756,
182
- "learning_rate": 8.571428571428571e-05,
183
- "loss": 2.5253,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.6081871345029239,
188
- "grad_norm": 3.041779041290283,
189
- "learning_rate": 8.511904761904762e-05,
190
- "loss": 2.6737,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.631578947368421,
195
- "grad_norm": 3.319415807723999,
196
- "learning_rate": 8.452380952380952e-05,
197
- "loss": 2.6138,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.6549707602339181,
202
- "grad_norm": 4.307522773742676,
203
- "learning_rate": 8.392857142857144e-05,
204
- "loss": 2.5785,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.6783625730994152,
209
- "grad_norm": 3.9516043663024902,
210
- "learning_rate": 8.333333333333334e-05,
211
- "loss": 2.5619,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.7017543859649122,
216
- "grad_norm": 2.792844772338867,
217
- "learning_rate": 8.273809523809524e-05,
218
- "loss": 2.6865,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.7251461988304093,
223
- "grad_norm": 2.5754215717315674,
224
- "learning_rate": 8.214285714285714e-05,
225
- "loss": 2.6156,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.7485380116959064,
230
- "grad_norm": 2.5915491580963135,
231
- "learning_rate": 8.154761904761904e-05,
232
- "loss": 2.6147,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.7719298245614035,
237
- "grad_norm": 2.1252636909484863,
238
- "learning_rate": 8.095238095238096e-05,
239
- "loss": 2.5163,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.7953216374269005,
244
- "grad_norm": 4.138577461242676,
245
- "learning_rate": 8.035714285714287e-05,
246
- "loss": 2.5359,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.8187134502923976,
251
- "grad_norm": 4.6266279220581055,
252
- "learning_rate": 7.976190476190477e-05,
253
- "loss": 2.6015,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.8421052631578947,
258
- "grad_norm": 2.5281972885131836,
259
- "learning_rate": 7.916666666666666e-05,
260
- "loss": 2.5575,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.8654970760233918,
265
- "grad_norm": 5.6380109786987305,
266
- "learning_rate": 7.857142857142858e-05,
267
- "loss": 2.6148,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.8888888888888888,
272
- "grad_norm": 2.7080187797546387,
273
- "learning_rate": 7.797619047619048e-05,
274
- "loss": 2.5994,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.9122807017543859,
279
- "grad_norm": 3.670461893081665,
280
- "learning_rate": 7.738095238095239e-05,
281
- "loss": 2.6104,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.935672514619883,
286
- "grad_norm": 3.5698938369750977,
287
- "learning_rate": 7.67857142857143e-05,
288
- "loss": 2.6378,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.9590643274853801,
293
- "grad_norm": 2.317847490310669,
294
- "learning_rate": 7.619047619047618e-05,
295
- "loss": 2.4963,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.9824561403508771,
300
- "grad_norm": 6.330520153045654,
301
- "learning_rate": 7.55952380952381e-05,
302
- "loss": 2.5993,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 1.0,
307
- "grad_norm": 3.272627115249634,
308
- "learning_rate": 7.500000000000001e-05,
309
  "loss": 1.9135,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 1.0,
314
- "eval_loss": 0.323538213968277,
315
- "eval_runtime": 3.3735,
316
- "eval_samples_per_second": 64.029,
317
- "eval_steps_per_second": 16.007,
318
  "step": 43
319
- },
320
- {
321
- "epoch": 1.023391812865497,
322
- "grad_norm": 5.024760723114014,
323
- "learning_rate": 7.440476190476191e-05,
324
- "loss": 2.5518,
325
- "step": 44
326
- },
327
- {
328
- "epoch": 1.0467836257309941,
329
- "grad_norm": 3.8648533821105957,
330
- "learning_rate": 7.380952380952382e-05,
331
- "loss": 2.5287,
332
- "step": 45
333
- },
334
- {
335
- "epoch": 1.0701754385964912,
336
- "grad_norm": 2.0299911499023438,
337
- "learning_rate": 7.321428571428571e-05,
338
- "loss": 2.5216,
339
- "step": 46
340
- },
341
- {
342
- "epoch": 1.0935672514619883,
343
- "grad_norm": 3.747516632080078,
344
- "learning_rate": 7.261904761904762e-05,
345
- "loss": 2.6046,
346
- "step": 47
347
- },
348
- {
349
- "epoch": 1.1169590643274854,
350
- "grad_norm": 3.7166852951049805,
351
- "learning_rate": 7.202380952380953e-05,
352
- "loss": 2.5714,
353
- "step": 48
354
- },
355
- {
356
- "epoch": 1.1403508771929824,
357
- "grad_norm": 5.546866416931152,
358
- "learning_rate": 7.142857142857143e-05,
359
- "loss": 2.555,
360
- "step": 49
361
- },
362
- {
363
- "epoch": 1.1637426900584795,
364
- "grad_norm": 3.976203441619873,
365
- "learning_rate": 7.083333333333334e-05,
366
- "loss": 2.5272,
367
- "step": 50
368
- },
369
- {
370
- "epoch": 1.1871345029239766,
371
- "grad_norm": 2.14218807220459,
372
- "learning_rate": 7.023809523809524e-05,
373
- "loss": 2.5428,
374
- "step": 51
375
- },
376
- {
377
- "epoch": 1.2105263157894737,
378
- "grad_norm": 3.9988062381744385,
379
- "learning_rate": 6.964285714285715e-05,
380
- "loss": 2.5357,
381
- "step": 52
382
- },
383
- {
384
- "epoch": 1.2339181286549707,
385
- "grad_norm": 3.975252151489258,
386
- "learning_rate": 6.904761904761905e-05,
387
- "loss": 2.6062,
388
- "step": 53
389
- },
390
- {
391
- "epoch": 1.2573099415204678,
392
- "grad_norm": 2.3471555709838867,
393
- "learning_rate": 6.845238095238096e-05,
394
- "loss": 2.5316,
395
- "step": 54
396
- },
397
- {
398
- "epoch": 1.280701754385965,
399
- "grad_norm": 1.675955057144165,
400
- "learning_rate": 6.785714285714286e-05,
401
- "loss": 2.5567,
402
- "step": 55
403
- },
404
- {
405
- "epoch": 1.304093567251462,
406
- "grad_norm": 1.435637354850769,
407
- "learning_rate": 6.726190476190477e-05,
408
- "loss": 2.493,
409
- "step": 56
410
- },
411
- {
412
- "epoch": 1.327485380116959,
413
- "grad_norm": 1.7799092531204224,
414
- "learning_rate": 6.666666666666667e-05,
415
- "loss": 2.4916,
416
- "step": 57
417
- },
418
- {
419
- "epoch": 1.3508771929824561,
420
- "grad_norm": 2.0183417797088623,
421
- "learning_rate": 6.607142857142857e-05,
422
- "loss": 2.523,
423
- "step": 58
424
- },
425
- {
426
- "epoch": 1.3742690058479532,
427
- "grad_norm": 2.379734992980957,
428
- "learning_rate": 6.547619047619048e-05,
429
- "loss": 2.5916,
430
- "step": 59
431
- },
432
- {
433
- "epoch": 1.3976608187134503,
434
- "grad_norm": 4.267402172088623,
435
- "learning_rate": 6.488095238095238e-05,
436
- "loss": 2.5291,
437
- "step": 60
438
- },
439
- {
440
- "epoch": 1.4210526315789473,
441
- "grad_norm": 5.298277378082275,
442
- "learning_rate": 6.428571428571429e-05,
443
- "loss": 2.5419,
444
- "step": 61
445
- },
446
- {
447
- "epoch": 1.4444444444444444,
448
- "grad_norm": 2.4158201217651367,
449
- "learning_rate": 6.369047619047619e-05,
450
- "loss": 2.565,
451
- "step": 62
452
- },
453
- {
454
- "epoch": 1.4678362573099415,
455
- "grad_norm": 3.903258800506592,
456
- "learning_rate": 6.30952380952381e-05,
457
- "loss": 2.5234,
458
- "step": 63
459
- },
460
- {
461
- "epoch": 1.4912280701754386,
462
- "grad_norm": 1.7788355350494385,
463
- "learning_rate": 6.25e-05,
464
- "loss": 2.6098,
465
- "step": 64
466
- },
467
- {
468
- "epoch": 1.5146198830409356,
469
- "grad_norm": 4.656391143798828,
470
- "learning_rate": 6.19047619047619e-05,
471
- "loss": 2.5448,
472
- "step": 65
473
- },
474
- {
475
- "epoch": 1.5380116959064327,
476
- "grad_norm": 3.0788934230804443,
477
- "learning_rate": 6.130952380952381e-05,
478
- "loss": 2.5008,
479
- "step": 66
480
- },
481
- {
482
- "epoch": 1.5614035087719298,
483
- "grad_norm": 1.855888843536377,
484
- "learning_rate": 6.0714285714285715e-05,
485
- "loss": 2.5913,
486
- "step": 67
487
- },
488
- {
489
- "epoch": 1.5847953216374269,
490
- "grad_norm": 2.3010807037353516,
491
- "learning_rate": 6.011904761904762e-05,
492
- "loss": 2.6243,
493
- "step": 68
494
- },
495
- {
496
- "epoch": 1.608187134502924,
497
- "grad_norm": 3.488280773162842,
498
- "learning_rate": 5.9523809523809524e-05,
499
- "loss": 2.5247,
500
- "step": 69
501
- },
502
- {
503
- "epoch": 1.631578947368421,
504
- "grad_norm": 2.508605480194092,
505
- "learning_rate": 5.8928571428571435e-05,
506
- "loss": 2.4904,
507
- "step": 70
508
- },
509
- {
510
- "epoch": 1.654970760233918,
511
- "grad_norm": 2.490065336227417,
512
- "learning_rate": 5.833333333333334e-05,
513
- "loss": 2.5192,
514
- "step": 71
515
- },
516
- {
517
- "epoch": 1.6783625730994152,
518
- "grad_norm": 2.5170626640319824,
519
- "learning_rate": 5.773809523809524e-05,
520
- "loss": 2.4772,
521
- "step": 72
522
- },
523
- {
524
- "epoch": 1.7017543859649122,
525
- "grad_norm": 2.740570306777954,
526
- "learning_rate": 5.714285714285714e-05,
527
- "loss": 2.5289,
528
- "step": 73
529
- },
530
- {
531
- "epoch": 1.7251461988304093,
532
- "grad_norm": 3.663625717163086,
533
- "learning_rate": 5.6547619047619046e-05,
534
- "loss": 2.4998,
535
- "step": 74
536
- },
537
- {
538
- "epoch": 1.7485380116959064,
539
- "grad_norm": 3.0818302631378174,
540
- "learning_rate": 5.595238095238096e-05,
541
- "loss": 2.5162,
542
- "step": 75
543
- },
544
- {
545
- "epoch": 1.7719298245614035,
546
- "grad_norm": 4.646051406860352,
547
- "learning_rate": 5.535714285714286e-05,
548
- "loss": 2.4664,
549
- "step": 76
550
- },
551
- {
552
- "epoch": 1.7953216374269005,
553
- "grad_norm": 5.245360374450684,
554
- "learning_rate": 5.4761904761904766e-05,
555
- "loss": 2.5579,
556
- "step": 77
557
- },
558
- {
559
- "epoch": 1.8187134502923976,
560
- "grad_norm": 2.5500850677490234,
561
- "learning_rate": 5.4166666666666664e-05,
562
- "loss": 2.5464,
563
- "step": 78
564
- },
565
- {
566
- "epoch": 1.8421052631578947,
567
- "grad_norm": 3.6748623847961426,
568
- "learning_rate": 5.3571428571428575e-05,
569
- "loss": 2.533,
570
- "step": 79
571
- },
572
- {
573
- "epoch": 1.8654970760233918,
574
- "grad_norm": 3.894373893737793,
575
- "learning_rate": 5.297619047619048e-05,
576
- "loss": 2.5526,
577
- "step": 80
578
- },
579
- {
580
- "epoch": 1.8888888888888888,
581
- "grad_norm": 2.844043731689453,
582
- "learning_rate": 5.2380952380952384e-05,
583
- "loss": 2.5099,
584
- "step": 81
585
- },
586
- {
587
- "epoch": 1.912280701754386,
588
- "grad_norm": 3.2156522274017334,
589
- "learning_rate": 5.1785714285714296e-05,
590
- "loss": 2.4995,
591
- "step": 82
592
- },
593
- {
594
- "epoch": 1.935672514619883,
595
- "grad_norm": 3.4957337379455566,
596
- "learning_rate": 5.119047619047619e-05,
597
- "loss": 2.4958,
598
- "step": 83
599
- },
600
- {
601
- "epoch": 1.95906432748538,
602
- "grad_norm": 2.481201648712158,
603
- "learning_rate": 5.05952380952381e-05,
604
- "loss": 2.5521,
605
- "step": 84
606
- },
607
- {
608
- "epoch": 1.9824561403508771,
609
- "grad_norm": 1.6848102807998657,
610
- "learning_rate": 5e-05,
611
- "loss": 2.4841,
612
- "step": 85
613
- },
614
- {
615
- "epoch": 2.0,
616
- "grad_norm": 1.674892783164978,
617
- "learning_rate": 4.940476190476191e-05,
618
- "loss": 1.931,
619
- "step": 86
620
- },
621
- {
622
- "epoch": 2.0,
623
- "eval_loss": 0.31422847509384155,
624
- "eval_runtime": 3.4783,
625
- "eval_samples_per_second": 62.1,
626
- "eval_steps_per_second": 15.525,
627
- "step": 86
628
- },
629
- {
630
- "epoch": 2.023391812865497,
631
- "grad_norm": 2.143749952316284,
632
- "learning_rate": 4.880952380952381e-05,
633
- "loss": 2.5044,
634
- "step": 87
635
- },
636
- {
637
- "epoch": 2.046783625730994,
638
- "grad_norm": 1.8259425163269043,
639
- "learning_rate": 4.8214285714285716e-05,
640
- "loss": 2.5145,
641
- "step": 88
642
- },
643
- {
644
- "epoch": 2.0701754385964914,
645
- "grad_norm": 2.2776169776916504,
646
- "learning_rate": 4.761904761904762e-05,
647
- "loss": 2.5045,
648
- "step": 89
649
- },
650
- {
651
- "epoch": 2.0935672514619883,
652
- "grad_norm": 2.4573814868927,
653
- "learning_rate": 4.7023809523809525e-05,
654
- "loss": 2.4763,
655
- "step": 90
656
- },
657
- {
658
- "epoch": 2.116959064327485,
659
- "grad_norm": 1.7515789270401,
660
- "learning_rate": 4.642857142857143e-05,
661
- "loss": 2.4691,
662
- "step": 91
663
- },
664
- {
665
- "epoch": 2.1403508771929824,
666
- "grad_norm": 3.7100000381469727,
667
- "learning_rate": 4.5833333333333334e-05,
668
- "loss": 2.5116,
669
- "step": 92
670
- },
671
- {
672
- "epoch": 2.1637426900584797,
673
- "grad_norm": 4.203453540802002,
674
- "learning_rate": 4.523809523809524e-05,
675
- "loss": 2.4578,
676
- "step": 93
677
- },
678
- {
679
- "epoch": 2.1871345029239766,
680
- "grad_norm": 2.1631312370300293,
681
- "learning_rate": 4.464285714285715e-05,
682
- "loss": 2.4542,
683
- "step": 94
684
- },
685
- {
686
- "epoch": 2.2105263157894735,
687
- "grad_norm": 2.008855104446411,
688
- "learning_rate": 4.404761904761905e-05,
689
- "loss": 2.4875,
690
- "step": 95
691
- },
692
- {
693
- "epoch": 2.2339181286549707,
694
- "grad_norm": 2.7923460006713867,
695
- "learning_rate": 4.345238095238096e-05,
696
- "loss": 2.4972,
697
- "step": 96
698
- },
699
- {
700
- "epoch": 2.257309941520468,
701
- "grad_norm": 1.9210076332092285,
702
- "learning_rate": 4.2857142857142856e-05,
703
- "loss": 2.5156,
704
- "step": 97
705
- },
706
- {
707
- "epoch": 2.280701754385965,
708
- "grad_norm": 2.0055480003356934,
709
- "learning_rate": 4.226190476190476e-05,
710
- "loss": 2.4456,
711
- "step": 98
712
- },
713
- {
714
- "epoch": 2.3040935672514617,
715
- "grad_norm": 1.6126084327697754,
716
- "learning_rate": 4.166666666666667e-05,
717
- "loss": 2.4566,
718
- "step": 99
719
- },
720
- {
721
- "epoch": 2.327485380116959,
722
- "grad_norm": 2.9103384017944336,
723
- "learning_rate": 4.107142857142857e-05,
724
- "loss": 2.4987,
725
- "step": 100
726
- },
727
- {
728
- "epoch": 2.3508771929824563,
729
- "grad_norm": 2.317208766937256,
730
- "learning_rate": 4.047619047619048e-05,
731
- "loss": 2.4523,
732
- "step": 101
733
- },
734
- {
735
- "epoch": 2.374269005847953,
736
- "grad_norm": 1.8047789335250854,
737
- "learning_rate": 3.9880952380952386e-05,
738
- "loss": 2.4627,
739
- "step": 102
740
- },
741
- {
742
- "epoch": 2.39766081871345,
743
- "grad_norm": 2.57161545753479,
744
- "learning_rate": 3.928571428571429e-05,
745
- "loss": 2.4551,
746
- "step": 103
747
- },
748
- {
749
- "epoch": 2.4210526315789473,
750
- "grad_norm": 1.9964399337768555,
751
- "learning_rate": 3.8690476190476195e-05,
752
- "loss": 2.452,
753
- "step": 104
754
- },
755
- {
756
- "epoch": 2.4444444444444446,
757
- "grad_norm": 1.4267529249191284,
758
- "learning_rate": 3.809523809523809e-05,
759
- "loss": 2.4888,
760
- "step": 105
761
- },
762
- {
763
- "epoch": 2.4678362573099415,
764
- "grad_norm": 3.7493860721588135,
765
- "learning_rate": 3.7500000000000003e-05,
766
- "loss": 2.5197,
767
- "step": 106
768
- },
769
- {
770
- "epoch": 2.4912280701754383,
771
- "grad_norm": 2.891245126724243,
772
- "learning_rate": 3.690476190476191e-05,
773
- "loss": 2.4384,
774
- "step": 107
775
- },
776
- {
777
- "epoch": 2.5146198830409356,
778
- "grad_norm": 2.032031774520874,
779
- "learning_rate": 3.630952380952381e-05,
780
- "loss": 2.4911,
781
- "step": 108
782
- },
783
- {
784
- "epoch": 2.538011695906433,
785
- "grad_norm": 1.5395787954330444,
786
- "learning_rate": 3.571428571428572e-05,
787
- "loss": 2.4612,
788
- "step": 109
789
- },
790
- {
791
- "epoch": 2.56140350877193,
792
- "grad_norm": 2.0416135787963867,
793
- "learning_rate": 3.511904761904762e-05,
794
- "loss": 2.5042,
795
- "step": 110
796
- },
797
- {
798
- "epoch": 2.5847953216374266,
799
- "grad_norm": 1.8836208581924438,
800
- "learning_rate": 3.4523809523809526e-05,
801
- "loss": 2.455,
802
- "step": 111
803
- },
804
- {
805
- "epoch": 2.608187134502924,
806
- "grad_norm": 2.659785747528076,
807
- "learning_rate": 3.392857142857143e-05,
808
- "loss": 2.4787,
809
- "step": 112
810
- },
811
- {
812
- "epoch": 2.6315789473684212,
813
- "grad_norm": 1.6700447797775269,
814
- "learning_rate": 3.3333333333333335e-05,
815
- "loss": 2.4873,
816
- "step": 113
817
- },
818
- {
819
- "epoch": 2.654970760233918,
820
- "grad_norm": 2.6808204650878906,
821
- "learning_rate": 3.273809523809524e-05,
822
- "loss": 2.5067,
823
- "step": 114
824
- },
825
- {
826
- "epoch": 2.678362573099415,
827
- "grad_norm": 2.243744373321533,
828
- "learning_rate": 3.2142857142857144e-05,
829
- "loss": 2.4924,
830
- "step": 115
831
- },
832
- {
833
- "epoch": 2.7017543859649122,
834
- "grad_norm": 2.5171408653259277,
835
- "learning_rate": 3.154761904761905e-05,
836
- "loss": 2.4808,
837
- "step": 116
838
- },
839
- {
840
- "epoch": 2.7251461988304095,
841
- "grad_norm": 3.511664628982544,
842
- "learning_rate": 3.095238095238095e-05,
843
- "loss": 2.4996,
844
- "step": 117
845
- },
846
- {
847
- "epoch": 2.7485380116959064,
848
- "grad_norm": 2.554797649383545,
849
- "learning_rate": 3.0357142857142857e-05,
850
- "loss": 2.4623,
851
- "step": 118
852
- },
853
- {
854
- "epoch": 2.7719298245614032,
855
- "grad_norm": 2.256316900253296,
856
- "learning_rate": 2.9761904761904762e-05,
857
- "loss": 2.4806,
858
- "step": 119
859
- },
860
- {
861
- "epoch": 2.7953216374269005,
862
- "grad_norm": 2.518251895904541,
863
- "learning_rate": 2.916666666666667e-05,
864
- "loss": 2.4336,
865
- "step": 120
866
- },
867
- {
868
- "epoch": 2.818713450292398,
869
- "grad_norm": 2.2938737869262695,
870
- "learning_rate": 2.857142857142857e-05,
871
- "loss": 2.5024,
872
- "step": 121
873
- },
874
- {
875
- "epoch": 2.8421052631578947,
876
- "grad_norm": 2.4636147022247314,
877
- "learning_rate": 2.797619047619048e-05,
878
- "loss": 2.5213,
879
- "step": 122
880
- },
881
- {
882
- "epoch": 2.8654970760233915,
883
- "grad_norm": 2.4576895236968994,
884
- "learning_rate": 2.7380952380952383e-05,
885
- "loss": 2.4407,
886
- "step": 123
887
- },
888
- {
889
- "epoch": 2.888888888888889,
890
- "grad_norm": 2.4760570526123047,
891
- "learning_rate": 2.6785714285714288e-05,
892
- "loss": 2.5222,
893
- "step": 124
894
- },
895
- {
896
- "epoch": 2.912280701754386,
897
- "grad_norm": 1.9041963815689087,
898
- "learning_rate": 2.6190476190476192e-05,
899
- "loss": 2.497,
900
- "step": 125
901
- },
902
- {
903
- "epoch": 2.935672514619883,
904
- "grad_norm": 2.592594623565674,
905
- "learning_rate": 2.5595238095238093e-05,
906
- "loss": 2.4658,
907
- "step": 126
908
- },
909
- {
910
- "epoch": 2.95906432748538,
911
- "grad_norm": 1.9708219766616821,
912
- "learning_rate": 2.5e-05,
913
- "loss": 2.469,
914
- "step": 127
915
- },
916
- {
917
- "epoch": 2.982456140350877,
918
- "grad_norm": 2.22748064994812,
919
- "learning_rate": 2.4404761904761906e-05,
920
- "loss": 2.4454,
921
- "step": 128
922
- },
923
- {
924
- "epoch": 3.0,
925
- "grad_norm": 1.8958027362823486,
926
- "learning_rate": 2.380952380952381e-05,
927
- "loss": 1.8965,
928
- "step": 129
929
- },
930
- {
931
- "epoch": 3.0,
932
- "eval_loss": 0.3103340268135071,
933
- "eval_runtime": 3.4912,
934
- "eval_samples_per_second": 61.869,
935
- "eval_steps_per_second": 15.467,
936
- "step": 129
937
- },
938
- {
939
- "epoch": 3.023391812865497,
940
- "grad_norm": 1.7813217639923096,
941
- "learning_rate": 2.3214285714285715e-05,
942
- "loss": 2.4409,
943
- "step": 130
944
- },
945
- {
946
- "epoch": 3.046783625730994,
947
- "grad_norm": 1.6289114952087402,
948
- "learning_rate": 2.261904761904762e-05,
949
- "loss": 2.4332,
950
- "step": 131
951
- },
952
- {
953
- "epoch": 3.0701754385964914,
954
- "grad_norm": 1.4746161699295044,
955
- "learning_rate": 2.2023809523809524e-05,
956
- "loss": 2.4365,
957
- "step": 132
958
- },
959
- {
960
- "epoch": 3.0935672514619883,
961
- "grad_norm": 2.3188133239746094,
962
- "learning_rate": 2.1428571428571428e-05,
963
- "loss": 2.4408,
964
- "step": 133
965
- },
966
- {
967
- "epoch": 3.116959064327485,
968
- "grad_norm": 1.423759937286377,
969
- "learning_rate": 2.0833333333333336e-05,
970
- "loss": 2.3871,
971
- "step": 134
972
- },
973
- {
974
- "epoch": 3.1403508771929824,
975
- "grad_norm": 2.078610420227051,
976
- "learning_rate": 2.023809523809524e-05,
977
- "loss": 2.4912,
978
- "step": 135
979
- },
980
- {
981
- "epoch": 3.1637426900584797,
982
- "grad_norm": 1.9567757844924927,
983
- "learning_rate": 1.9642857142857145e-05,
984
- "loss": 2.4318,
985
- "step": 136
986
- },
987
- {
988
- "epoch": 3.1871345029239766,
989
- "grad_norm": 2.138343095779419,
990
- "learning_rate": 1.9047619047619046e-05,
991
- "loss": 2.4311,
992
- "step": 137
993
- },
994
- {
995
- "epoch": 3.2105263157894735,
996
- "grad_norm": 1.9541492462158203,
997
- "learning_rate": 1.8452380952380954e-05,
998
- "loss": 2.4532,
999
- "step": 138
1000
- },
1001
- {
1002
- "epoch": 3.2339181286549707,
1003
- "grad_norm": 1.905840277671814,
1004
- "learning_rate": 1.785714285714286e-05,
1005
- "loss": 2.4833,
1006
- "step": 139
1007
- },
1008
- {
1009
- "epoch": 3.257309941520468,
1010
- "grad_norm": 1.5300649404525757,
1011
- "learning_rate": 1.7261904761904763e-05,
1012
- "loss": 2.4629,
1013
- "step": 140
1014
- },
1015
- {
1016
- "epoch": 3.280701754385965,
1017
- "grad_norm": 1.8787633180618286,
1018
- "learning_rate": 1.6666666666666667e-05,
1019
- "loss": 2.4423,
1020
- "step": 141
1021
- },
1022
- {
1023
- "epoch": 3.3040935672514617,
1024
- "grad_norm": 1.9487767219543457,
1025
- "learning_rate": 1.6071428571428572e-05,
1026
- "loss": 2.4558,
1027
- "step": 142
1028
- },
1029
- {
1030
- "epoch": 3.327485380116959,
1031
- "grad_norm": 1.8227450847625732,
1032
- "learning_rate": 1.5476190476190476e-05,
1033
- "loss": 2.4575,
1034
- "step": 143
1035
- },
1036
- {
1037
- "epoch": 3.3508771929824563,
1038
- "grad_norm": 1.9631016254425049,
1039
- "learning_rate": 1.4880952380952381e-05,
1040
- "loss": 2.4492,
1041
- "step": 144
1042
- },
1043
- {
1044
- "epoch": 3.374269005847953,
1045
- "grad_norm": 1.6142218112945557,
1046
- "learning_rate": 1.4285714285714285e-05,
1047
- "loss": 2.4313,
1048
- "step": 145
1049
- },
1050
- {
1051
- "epoch": 3.39766081871345,
1052
- "grad_norm": 1.4569997787475586,
1053
- "learning_rate": 1.3690476190476192e-05,
1054
- "loss": 2.3885,
1055
- "step": 146
1056
- },
1057
- {
1058
- "epoch": 3.4210526315789473,
1059
- "grad_norm": 2.126253843307495,
1060
- "learning_rate": 1.3095238095238096e-05,
1061
- "loss": 2.4366,
1062
- "step": 147
1063
- },
1064
- {
1065
- "epoch": 3.4444444444444446,
1066
- "grad_norm": 1.3649730682373047,
1067
- "learning_rate": 1.25e-05,
1068
- "loss": 2.4278,
1069
- "step": 148
1070
- },
1071
- {
1072
- "epoch": 3.4678362573099415,
1073
- "grad_norm": 1.5861579179763794,
1074
- "learning_rate": 1.1904761904761905e-05,
1075
- "loss": 2.4695,
1076
- "step": 149
1077
- },
1078
- {
1079
- "epoch": 3.4912280701754383,
1080
- "grad_norm": 1.743173360824585,
1081
- "learning_rate": 1.130952380952381e-05,
1082
- "loss": 2.4422,
1083
- "step": 150
1084
- },
1085
- {
1086
- "epoch": 3.5146198830409356,
1087
- "grad_norm": 1.6894521713256836,
1088
- "learning_rate": 1.0714285714285714e-05,
1089
- "loss": 2.4343,
1090
- "step": 151
1091
- },
1092
- {
1093
- "epoch": 3.538011695906433,
1094
- "grad_norm": 1.4037004709243774,
1095
- "learning_rate": 1.011904761904762e-05,
1096
- "loss": 2.4197,
1097
- "step": 152
1098
- },
1099
- {
1100
- "epoch": 3.56140350877193,
1101
- "grad_norm": 2.512463092803955,
1102
- "learning_rate": 9.523809523809523e-06,
1103
- "loss": 2.4515,
1104
- "step": 153
1105
- },
1106
- {
1107
- "epoch": 3.5847953216374266,
1108
- "grad_norm": 1.968013048171997,
1109
- "learning_rate": 8.92857142857143e-06,
1110
- "loss": 2.4602,
1111
- "step": 154
1112
- },
1113
- {
1114
- "epoch": 3.608187134502924,
1115
- "grad_norm": 1.521202802658081,
1116
- "learning_rate": 8.333333333333334e-06,
1117
- "loss": 2.4167,
1118
- "step": 155
1119
- },
1120
- {
1121
- "epoch": 3.6315789473684212,
1122
- "grad_norm": 1.4753526449203491,
1123
- "learning_rate": 7.738095238095238e-06,
1124
- "loss": 2.4613,
1125
- "step": 156
1126
- },
1127
- {
1128
- "epoch": 3.654970760233918,
1129
- "grad_norm": 2.0607075691223145,
1130
- "learning_rate": 7.142857142857143e-06,
1131
- "loss": 2.4299,
1132
- "step": 157
1133
- },
1134
- {
1135
- "epoch": 3.678362573099415,
1136
- "grad_norm": 1.3630319833755493,
1137
- "learning_rate": 6.547619047619048e-06,
1138
- "loss": 2.4417,
1139
- "step": 158
1140
- },
1141
- {
1142
- "epoch": 3.7017543859649122,
1143
- "grad_norm": 1.2382704019546509,
1144
- "learning_rate": 5.9523809523809525e-06,
1145
- "loss": 2.43,
1146
- "step": 159
1147
- },
1148
- {
1149
- "epoch": 3.7251461988304095,
1150
- "grad_norm": 1.4374663829803467,
1151
- "learning_rate": 5.357142857142857e-06,
1152
- "loss": 2.4523,
1153
- "step": 160
1154
- },
1155
- {
1156
- "epoch": 3.7485380116959064,
1157
- "grad_norm": 1.6586753129959106,
1158
- "learning_rate": 4.7619047619047615e-06,
1159
- "loss": 2.4755,
1160
- "step": 161
1161
- },
1162
- {
1163
- "epoch": 3.7719298245614032,
1164
- "grad_norm": 1.7251605987548828,
1165
- "learning_rate": 4.166666666666667e-06,
1166
- "loss": 2.5089,
1167
- "step": 162
1168
- },
1169
- {
1170
- "epoch": 3.7953216374269005,
1171
- "grad_norm": 1.5588628053665161,
1172
- "learning_rate": 3.5714285714285714e-06,
1173
- "loss": 2.4195,
1174
- "step": 163
1175
- },
1176
- {
1177
- "epoch": 3.818713450292398,
1178
- "grad_norm": 1.722469449043274,
1179
- "learning_rate": 2.9761904761904763e-06,
1180
- "loss": 2.4104,
1181
- "step": 164
1182
- },
1183
- {
1184
- "epoch": 3.8421052631578947,
1185
- "grad_norm": 1.9132815599441528,
1186
- "learning_rate": 2.3809523809523808e-06,
1187
- "loss": 2.4492,
1188
- "step": 165
1189
- },
1190
- {
1191
- "epoch": 3.8654970760233915,
1192
- "grad_norm": 2.082366704940796,
1193
- "learning_rate": 1.7857142857142857e-06,
1194
- "loss": 2.4535,
1195
- "step": 166
1196
- },
1197
- {
1198
- "epoch": 3.888888888888889,
1199
- "grad_norm": 1.7362799644470215,
1200
- "learning_rate": 1.1904761904761904e-06,
1201
- "loss": 2.4908,
1202
- "step": 167
1203
- },
1204
- {
1205
- "epoch": 3.912280701754386,
1206
- "grad_norm": 1.6625697612762451,
1207
- "learning_rate": 5.952380952380952e-07,
1208
- "loss": 2.4552,
1209
- "step": 168
1210
- },
1211
- {
1212
- "epoch": 3.912280701754386,
1213
- "eval_loss": 0.3090362250804901,
1214
- "eval_runtime": 3.7281,
1215
- "eval_samples_per_second": 57.938,
1216
- "eval_steps_per_second": 14.484,
1217
- "step": 168
1218
  }
1219
  ],
1220
  "logging_steps": 1,
1221
- "max_steps": 168,
1222
  "num_input_tokens_seen": 0,
1223
- "num_train_epochs": 4,
1224
  "save_steps": 500,
1225
  "stateful_callbacks": {
1226
  "TrainerControl": {
@@ -1229,12 +330,12 @@
1229
  "should_evaluate": false,
1230
  "should_log": false,
1231
  "should_save": true,
1232
- "should_training_stop": true
1233
  },
1234
  "attributes": {}
1235
  }
1236
  },
1237
- "total_flos": 8416595999195136.0,
1238
  "train_batch_size": 12,
1239
  "trial_name": null,
1240
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.31598106026649475,
3
+ "best_model_checkpoint": "./opt_trained/checkpoint-43",
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 43,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.023391812865497075,
13
+ "grad_norm": 1.4262398481369019,
14
+ "learning_rate": 9.995238095238095e-05,
15
+ "loss": 2.4722,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.04678362573099415,
20
+ "grad_norm": 12.206507682800293,
21
+ "learning_rate": 9.990476190476191e-05,
22
+ "loss": 3.176,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.07017543859649122,
27
+ "grad_norm": 11.578840255737305,
28
+ "learning_rate": 9.985714285714287e-05,
29
+ "loss": 2.8396,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.0935672514619883,
34
+ "grad_norm": 9.648161888122559,
35
+ "learning_rate": 9.980952380952382e-05,
36
+ "loss": 2.7336,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.11695906432748537,
41
+ "grad_norm": 3.8782691955566406,
42
+ "learning_rate": 9.976190476190477e-05,
43
+ "loss": 2.6955,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.14035087719298245,
48
+ "grad_norm": 12.644487380981445,
49
+ "learning_rate": 9.971428571428571e-05,
50
+ "loss": 2.6405,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.16374269005847952,
55
+ "grad_norm": 11.057122230529785,
56
+ "learning_rate": 9.966666666666667e-05,
57
+ "loss": 2.7113,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.1871345029239766,
62
+ "grad_norm": 4.860190391540527,
63
+ "learning_rate": 9.961904761904762e-05,
64
+ "loss": 2.7076,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.21052631578947367,
69
+ "grad_norm": 4.317215442657471,
70
+ "learning_rate": 9.957142857142858e-05,
71
+ "loss": 2.6377,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.23391812865497075,
76
+ "grad_norm": 3.3068416118621826,
77
+ "learning_rate": 9.952380952380953e-05,
78
+ "loss": 2.5995,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.2573099415204678,
83
+ "grad_norm": 1.2752724885940552,
84
+ "learning_rate": 9.947619047619048e-05,
85
+ "loss": 2.6285,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.2807017543859649,
90
+ "grad_norm": 8.849737167358398,
91
+ "learning_rate": 9.942857142857144e-05,
92
+ "loss": 2.6217,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.30409356725146197,
97
+ "grad_norm": 5.594025611877441,
98
+ "learning_rate": 9.938095238095238e-05,
99
+ "loss": 2.6265,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.32748538011695905,
104
+ "grad_norm": 3.581617593765259,
105
+ "learning_rate": 9.933333333333334e-05,
106
+ "loss": 2.5984,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.3508771929824561,
111
+ "grad_norm": 5.33600378036499,
112
+ "learning_rate": 9.92857142857143e-05,
113
+ "loss": 2.6071,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.3742690058479532,
118
+ "grad_norm": 4.4274983406066895,
119
+ "learning_rate": 9.923809523809524e-05,
120
+ "loss": 2.5908,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.39766081871345027,
125
+ "grad_norm": 4.5507307052612305,
126
+ "learning_rate": 9.91904761904762e-05,
127
+ "loss": 2.5616,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.42105263157894735,
132
+ "grad_norm": 3.189161777496338,
133
+ "learning_rate": 9.914285714285715e-05,
134
+ "loss": 2.5784,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.4444444444444444,
139
+ "grad_norm": 2.706615924835205,
140
+ "learning_rate": 9.909523809523809e-05,
141
+ "loss": 2.5413,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.4678362573099415,
146
+ "grad_norm": 2.146662712097168,
147
+ "learning_rate": 9.904761904761905e-05,
148
+ "loss": 2.6019,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.49122807017543857,
153
+ "grad_norm": 3.2252964973449707,
154
+ "learning_rate": 9.900000000000001e-05,
155
+ "loss": 2.5914,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.5146198830409356,
160
+ "grad_norm": 2.8859879970550537,
161
+ "learning_rate": 9.895238095238095e-05,
162
+ "loss": 2.6133,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.5380116959064327,
167
+ "grad_norm": 2.5647897720336914,
168
+ "learning_rate": 9.890476190476191e-05,
169
+ "loss": 2.5425,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.5614035087719298,
174
+ "grad_norm": 3.0347073078155518,
175
+ "learning_rate": 9.885714285714286e-05,
176
+ "loss": 2.5732,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.5847953216374269,
181
+ "grad_norm": 1.8412858247756958,
182
+ "learning_rate": 9.880952380952381e-05,
183
+ "loss": 2.5776,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.6081871345029239,
188
+ "grad_norm": 3.1820366382598877,
189
+ "learning_rate": 9.876190476190477e-05,
190
+ "loss": 2.5566,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.631578947368421,
195
+ "grad_norm": 2.4613687992095947,
196
+ "learning_rate": 9.871428571428572e-05,
197
+ "loss": 2.5113,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.6549707602339181,
202
+ "grad_norm": 5.942374229431152,
203
+ "learning_rate": 9.866666666666668e-05,
204
+ "loss": 2.575,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.6783625730994152,
209
+ "grad_norm": 7.427689075469971,
210
+ "learning_rate": 9.861904761904762e-05,
211
+ "loss": 2.5692,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.7017543859649122,
216
+ "grad_norm": 3.163085699081421,
217
+ "learning_rate": 9.857142857142858e-05,
218
+ "loss": 2.5192,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.7251461988304093,
223
+ "grad_norm": 3.421778678894043,
224
+ "learning_rate": 9.852380952380952e-05,
225
+ "loss": 2.5533,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.7485380116959064,
230
+ "grad_norm": 2.486320734024048,
231
+ "learning_rate": 9.847619047619048e-05,
232
+ "loss": 2.515,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.7719298245614035,
237
+ "grad_norm": 3.2825722694396973,
238
+ "learning_rate": 9.842857142857144e-05,
239
+ "loss": 2.4994,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.7953216374269005,
244
+ "grad_norm": 3.5643672943115234,
245
+ "learning_rate": 9.838095238095238e-05,
246
+ "loss": 2.5327,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.8187134502923976,
251
+ "grad_norm": 3.3127200603485107,
252
+ "learning_rate": 9.833333333333333e-05,
253
+ "loss": 2.6251,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.8421052631578947,
258
+ "grad_norm": 3.187095880508423,
259
+ "learning_rate": 9.828571428571429e-05,
260
+ "loss": 2.5233,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.8654970760233918,
265
+ "grad_norm": 3.3743860721588135,
266
+ "learning_rate": 9.823809523809525e-05,
267
+ "loss": 2.4999,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.8888888888888888,
272
+ "grad_norm": 2.3684120178222656,
273
+ "learning_rate": 9.81904761904762e-05,
274
+ "loss": 2.5302,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.9122807017543859,
279
+ "grad_norm": 3.7091619968414307,
280
+ "learning_rate": 9.814285714285715e-05,
281
+ "loss": 2.5003,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.935672514619883,
286
+ "grad_norm": 4.230418682098389,
287
+ "learning_rate": 9.80952380952381e-05,
288
+ "loss": 2.5379,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.9590643274853801,
293
+ "grad_norm": 3.5879616737365723,
294
+ "learning_rate": 9.804761904761905e-05,
295
+ "loss": 2.5652,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.9824561403508771,
300
+ "grad_norm": 2.621013879776001,
301
+ "learning_rate": 9.8e-05,
302
+ "loss": 2.5704,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 1.0,
307
+ "grad_norm": 1.3536667823791504,
308
+ "learning_rate": 9.795238095238097e-05,
309
  "loss": 1.9135,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 1.0,
314
+ "eval_loss": 0.31598106026649475,
315
+ "eval_runtime": 3.4639,
316
+ "eval_samples_per_second": 62.358,
317
+ "eval_steps_per_second": 15.59,
318
  "step": 43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  }
320
  ],
321
  "logging_steps": 1,
322
+ "max_steps": 2100,
323
  "num_input_tokens_seen": 0,
324
+ "num_train_epochs": 50,
325
  "save_steps": 500,
326
  "stateful_callbacks": {
327
  "TrainerControl": {
 
330
  "should_evaluate": false,
331
  "should_log": false,
332
  "should_save": true,
333
+ "should_training_stop": false
334
  },
335
  "attributes": {}
336
  }
337
  },
338
+ "total_flos": 2151327228493824.0,
339
  "train_batch_size": 12,
340
  "trial_name": null,
341
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:228b31a33cdb89916f0345448bcc71416a768a57a127423492ac7dd71507ccc9
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01ee0c77a84c59226be8580dfbe625c82d72bc4456ab0e2ca0a40baf78f82e26
3
  size 5368