error577 commited on
Commit
de7d593
·
verified ·
1 Parent(s): 738f932

Training in progress, step 128, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "k_proj",
24
- "up_proj",
25
  "o_proj",
 
26
  "down_proj",
27
- "gate_proj",
28
- "v_proj",
29
- "q_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "v_proj",
24
+ "gate_proj",
25
  "o_proj",
26
+ "q_proj",
27
  "down_proj",
28
+ "up_proj",
29
+ "k_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3f0f330c5dd30037710dfbf893b79cb29436d7605c331600e080ee1b5bd14a2
3
  size 30026872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7322ded38f1358b662c50901dee8db86ae07a82ed276f247e3fbb9405b578e8e
3
  size 30026872
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:983fbe2f79308844735f65dc93021a51a40cf90eac2b77c3a25f9f52ddee984f
3
- size 15611732
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1884551d287fac52bc2af7c72494b16eaa64445a199fd9e4ed86867614e2050f
3
+ size 15611412
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fba5f66aab213b0fa00c180e93d69724a4ae021a159f7a32b7dab809d308f684
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a011d66fd370c631e51fd7ddaab9d7f56fbcfd5ad504dcd5d16420e7872fa9a2
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:947fde7ad1e89d6c6f4bd00098aa9fd81cc5bfcb34ae5bce9843e205478613e9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59fc2fc920faa440c3e3d0115925487d3c9ed02c8ad062660ac1db63d6a8c0fe
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.6365372374283895,
5
- "eval_steps": 250,
6
- "global_step": 375,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -11,550 +11,224 @@
11
  {
12
  "epoch": 0.001697432633142372,
13
  "eval_loss": 2.4177019596099854,
14
- "eval_runtime": 13.943,
15
- "eval_samples_per_second": 17.858,
16
- "eval_steps_per_second": 17.858,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.008487163165711862,
21
- "grad_norm": 0.5102115273475647,
22
  "learning_rate": 5e-06,
23
- "loss": 2.0834,
24
  "step": 5
25
  },
26
  {
27
  "epoch": 0.016974326331423723,
28
- "grad_norm": 0.5134205222129822,
29
  "learning_rate": 1e-05,
30
  "loss": 2.3638,
31
  "step": 10
32
  },
33
  {
34
  "epoch": 0.025461489497135583,
35
- "grad_norm": 0.5788155198097229,
36
  "learning_rate": 9.99743108100344e-06,
37
- "loss": 2.3434,
38
  "step": 15
39
  },
40
  {
41
  "epoch": 0.033948652662847446,
42
- "grad_norm": 0.7257423400878906,
43
  "learning_rate": 9.989726963751683e-06,
44
- "loss": 2.4883,
45
  "step": 20
46
  },
47
  {
48
  "epoch": 0.042435815828559306,
49
- "grad_norm": 0.6675796508789062,
50
  "learning_rate": 9.976895564745993e-06,
51
- "loss": 2.2916,
52
  "step": 25
53
  },
54
  {
55
  "epoch": 0.050922978994271166,
56
- "grad_norm": 0.42739933729171753,
57
  "learning_rate": 9.95895006911623e-06,
58
- "loss": 2.8234,
59
  "step": 30
60
  },
 
 
 
 
 
 
 
 
61
  {
62
  "epoch": 0.059410142159983026,
63
- "grad_norm": 0.5156223177909851,
64
  "learning_rate": 9.935908917072253e-06,
65
- "loss": 2.3804,
66
  "step": 35
67
  },
68
  {
69
  "epoch": 0.06789730532569489,
70
- "grad_norm": 0.8240678310394287,
71
  "learning_rate": 9.907795784955327e-06,
72
- "loss": 2.3095,
73
  "step": 40
74
  },
75
  {
76
  "epoch": 0.07638446849140675,
77
- "grad_norm": 0.7630073428153992,
78
  "learning_rate": 9.874639560909118e-06,
79
- "loss": 2.2891,
80
  "step": 45
81
  },
82
  {
83
  "epoch": 0.08487163165711861,
84
- "grad_norm": 0.8923418521881104,
85
  "learning_rate": 9.836474315195148e-06,
86
- "loss": 2.2642,
87
  "step": 50
88
  },
89
  {
90
  "epoch": 0.09335879482283047,
91
- "grad_norm": 0.7908952236175537,
92
  "learning_rate": 9.793339265183303e-06,
93
- "loss": 2.4782,
94
  "step": 55
95
  },
96
  {
97
  "epoch": 0.10184595798854233,
98
- "grad_norm": 0.8752570748329163,
99
  "learning_rate": 9.745278735053345e-06,
100
- "loss": 2.2442,
101
  "step": 60
102
  },
 
 
 
 
 
 
 
 
103
  {
104
  "epoch": 0.11033312115425419,
105
- "grad_norm": 0.8446648120880127,
106
  "learning_rate": 9.692342110248802e-06,
107
- "loss": 2.3623,
108
  "step": 65
109
  },
110
  {
111
  "epoch": 0.11882028431996605,
112
- "grad_norm": 0.8395955562591553,
113
  "learning_rate": 9.63458378673011e-06,
114
- "loss": 2.2078,
115
  "step": 70
116
  },
117
  {
118
  "epoch": 0.1273074474856779,
119
- "grad_norm": 0.96536785364151,
120
  "learning_rate": 9.572063115079063e-06,
121
- "loss": 2.3043,
122
  "step": 75
123
  },
124
  {
125
  "epoch": 0.13579461065138979,
126
- "grad_norm": 0.9060677289962769,
127
  "learning_rate": 9.504844339512096e-06,
128
- "loss": 2.4284,
129
  "step": 80
130
  },
131
  {
132
  "epoch": 0.14428177381710164,
133
- "grad_norm": 0.6167080402374268,
134
  "learning_rate": 9.432996531865001e-06,
135
- "loss": 2.2063,
136
  "step": 85
137
  },
138
  {
139
  "epoch": 0.1527689369828135,
140
- "grad_norm": 1.3145157098770142,
141
  "learning_rate": 9.356593520616948e-06,
142
- "loss": 2.4146,
143
  "step": 90
144
  },
145
  {
146
  "epoch": 0.16125610014852534,
147
- "grad_norm": 1.06257164478302,
148
  "learning_rate": 9.275713815026732e-06,
149
- "loss": 2.236,
150
  "step": 95
151
  },
 
 
 
 
 
 
 
 
152
  {
153
  "epoch": 0.16974326331423722,
154
- "grad_norm": 0.8050308227539062,
155
  "learning_rate": 9.190440524459203e-06,
156
- "loss": 2.5021,
157
  "step": 100
158
  },
159
  {
160
  "epoch": 0.17823042647994908,
161
- "grad_norm": 0.9117026925086975,
162
  "learning_rate": 9.10086127298478e-06,
163
- "loss": 2.243,
164
  "step": 105
165
  },
166
  {
167
  "epoch": 0.18671758964566093,
168
- "grad_norm": 0.7011638879776001,
169
  "learning_rate": 9.007068109339783e-06,
170
- "loss": 2.2535,
171
  "step": 110
172
  },
173
  {
174
  "epoch": 0.1952047528113728,
175
- "grad_norm": 0.6629008650779724,
176
  "learning_rate": 8.90915741234015e-06,
177
- "loss": 2.0724,
178
  "step": 115
179
  },
180
  {
181
  "epoch": 0.20369191597708466,
182
- "grad_norm": 0.6800772547721863,
183
  "learning_rate": 8.807229791845673e-06,
184
- "loss": 2.3102,
185
  "step": 120
186
  },
187
  {
188
  "epoch": 0.21217907914279652,
189
- "grad_norm": 0.7317128777503967,
190
  "learning_rate": 8.701389985376578e-06,
191
- "loss": 2.2061,
192
  "step": 125
193
  },
194
  {
195
- "epoch": 0.22066624230850837,
196
- "grad_norm": 0.7562074661254883,
197
- "learning_rate": 8.591746750488639e-06,
198
- "loss": 2.1027,
199
- "step": 130
200
- },
201
- {
202
- "epoch": 0.22915340547422025,
203
- "grad_norm": 1.395151972770691,
204
- "learning_rate": 8.478412753017433e-06,
205
- "loss": 2.2459,
206
- "step": 135
207
- },
208
- {
209
- "epoch": 0.2376405686399321,
210
- "grad_norm": 1.0409704446792603,
211
- "learning_rate": 8.361504451306585e-06,
212
- "loss": 2.0757,
213
- "step": 140
214
- },
215
- {
216
- "epoch": 0.24612773180564396,
217
- "grad_norm": 0.7646152973175049,
218
- "learning_rate": 8.241141976538944e-06,
219
- "loss": 2.1194,
220
- "step": 145
221
- },
222
- {
223
- "epoch": 0.2546148949713558,
224
- "grad_norm": 0.6315305829048157,
225
- "learning_rate": 8.117449009293668e-06,
226
- "loss": 2.029,
227
- "step": 150
228
- },
229
- {
230
- "epoch": 0.26310205813706766,
231
- "grad_norm": 0.8723805546760559,
232
- "learning_rate": 7.99055265245608e-06,
233
- "loss": 2.3872,
234
- "step": 155
235
- },
236
- {
237
- "epoch": 0.27158922130277957,
238
- "grad_norm": 0.912027895450592,
239
- "learning_rate": 7.860583300610849e-06,
240
- "loss": 2.0692,
241
- "step": 160
242
- },
243
- {
244
- "epoch": 0.2800763844684914,
245
- "grad_norm": 1.1273324489593506,
246
- "learning_rate": 7.727674506052744e-06,
247
- "loss": 1.9747,
248
- "step": 165
249
- },
250
- {
251
- "epoch": 0.2885635476342033,
252
- "grad_norm": 0.8349745869636536,
253
- "learning_rate": 7.591962841552627e-06,
254
- "loss": 1.9815,
255
- "step": 170
256
- },
257
- {
258
- "epoch": 0.29705071079991513,
259
- "grad_norm": 0.6559799313545227,
260
- "learning_rate": 7.453587760019691e-06,
261
- "loss": 2.1225,
262
- "step": 175
263
- },
264
- {
265
- "epoch": 0.305537873965627,
266
- "grad_norm": 0.935215413570404,
267
- "learning_rate": 7.312691451204178e-06,
268
- "loss": 2.1128,
269
- "step": 180
270
- },
271
- {
272
- "epoch": 0.31402503713133884,
273
- "grad_norm": 0.6984186768531799,
274
- "learning_rate": 7.169418695587791e-06,
275
- "loss": 2.4616,
276
- "step": 185
277
- },
278
- {
279
- "epoch": 0.3225122002970507,
280
- "grad_norm": 0.8212848901748657,
281
- "learning_rate": 7.023916715611969e-06,
282
- "loss": 2.132,
283
- "step": 190
284
- },
285
- {
286
- "epoch": 0.3309993634627626,
287
- "grad_norm": 0.9089066386222839,
288
- "learning_rate": 6.876335024396872e-06,
289
- "loss": 2.0719,
290
- "step": 195
291
- },
292
- {
293
- "epoch": 0.33948652662847445,
294
- "grad_norm": 0.7070319056510925,
295
- "learning_rate": 6.726825272106539e-06,
296
- "loss": 2.245,
297
- "step": 200
298
- },
299
- {
300
- "epoch": 0.3479736897941863,
301
- "grad_norm": 1.1449453830718994,
302
- "learning_rate": 6.575541090118105e-06,
303
- "loss": 2.2291,
304
- "step": 205
305
- },
306
- {
307
- "epoch": 0.35646085295989816,
308
- "grad_norm": 0.7681276798248291,
309
- "learning_rate": 6.4226379331551625e-06,
310
- "loss": 2.0973,
311
- "step": 210
312
- },
313
- {
314
- "epoch": 0.36494801612561,
315
- "grad_norm": 1.284940242767334,
316
- "learning_rate": 6.268272919547537e-06,
317
- "loss": 2.314,
318
- "step": 215
319
- },
320
- {
321
- "epoch": 0.37343517929132186,
322
- "grad_norm": 1.1216720342636108,
323
- "learning_rate": 6.112604669781572e-06,
324
- "loss": 2.076,
325
- "step": 220
326
- },
327
- {
328
- "epoch": 0.3819223424570337,
329
- "grad_norm": 0.9181829690933228,
330
- "learning_rate": 5.955793143506863e-06,
331
- "loss": 2.2251,
332
- "step": 225
333
- },
334
- {
335
- "epoch": 0.3904095056227456,
336
- "grad_norm": 0.9240125417709351,
337
- "learning_rate": 5.797999475166897e-06,
338
- "loss": 2.0162,
339
- "step": 230
340
- },
341
- {
342
- "epoch": 0.3988966687884575,
343
- "grad_norm": 0.9688006639480591,
344
- "learning_rate": 5.6393858084225305e-06,
345
- "loss": 1.9567,
346
- "step": 235
347
- },
348
- {
349
- "epoch": 0.40738383195416933,
350
- "grad_norm": 0.6780016422271729,
351
- "learning_rate": 5.480115129538409e-06,
352
- "loss": 2.2013,
353
- "step": 240
354
- },
355
- {
356
- "epoch": 0.4158709951198812,
357
- "grad_norm": 1.0788044929504395,
358
- "learning_rate": 5.320351099903565e-06,
359
- "loss": 2.0739,
360
- "step": 245
361
- },
362
- {
363
- "epoch": 0.42435815828559303,
364
- "grad_norm": 1.0367454290390015,
365
- "learning_rate": 5.160257887858278e-06,
366
- "loss": 1.9629,
367
- "step": 250
368
- },
369
- {
370
- "epoch": 0.42435815828559303,
371
- "eval_loss": 2.1809167861938477,
372
- "eval_runtime": 15.1521,
373
- "eval_samples_per_second": 16.433,
374
- "eval_steps_per_second": 16.433,
375
- "step": 250
376
- },
377
- {
378
- "epoch": 0.4328453214513049,
379
- "grad_norm": 0.9742569923400879,
380
- "learning_rate": 5e-06,
381
- "loss": 2.1209,
382
- "step": 255
383
- },
384
- {
385
- "epoch": 0.44133248461701674,
386
- "grad_norm": 1.4076533317565918,
387
- "learning_rate": 4.839742112141725e-06,
388
- "loss": 2.1278,
389
- "step": 260
390
- },
391
- {
392
- "epoch": 0.44981964778272865,
393
- "grad_norm": 0.9100192785263062,
394
- "learning_rate": 4.679648900096436e-06,
395
- "loss": 2.2103,
396
- "step": 265
397
- },
398
- {
399
- "epoch": 0.4583068109484405,
400
- "grad_norm": 1.089735746383667,
401
- "learning_rate": 4.5198848704615915e-06,
402
- "loss": 2.196,
403
- "step": 270
404
- },
405
- {
406
- "epoch": 0.46679397411415235,
407
- "grad_norm": 1.1712334156036377,
408
- "learning_rate": 4.3606141915774695e-06,
409
- "loss": 2.0448,
410
- "step": 275
411
- },
412
- {
413
- "epoch": 0.4752811372798642,
414
- "grad_norm": 0.9530436992645264,
415
- "learning_rate": 4.2020005248331056e-06,
416
- "loss": 2.0857,
417
- "step": 280
418
- },
419
- {
420
- "epoch": 0.48376830044557606,
421
- "grad_norm": 0.9787421226501465,
422
- "learning_rate": 4.04420685649314e-06,
423
- "loss": 2.1471,
424
- "step": 285
425
- },
426
- {
427
- "epoch": 0.4922554636112879,
428
- "grad_norm": 1.1793596744537354,
429
- "learning_rate": 3.887395330218429e-06,
430
- "loss": 2.0992,
431
- "step": 290
432
- },
433
- {
434
- "epoch": 0.5007426267769998,
435
- "grad_norm": 1.0962886810302734,
436
- "learning_rate": 3.731727080452464e-06,
437
- "loss": 1.9355,
438
- "step": 295
439
- },
440
- {
441
- "epoch": 0.5092297899427116,
442
- "grad_norm": 0.9063106179237366,
443
- "learning_rate": 3.5773620668448384e-06,
444
- "loss": 2.0345,
445
- "step": 300
446
- },
447
- {
448
- "epoch": 0.5177169531084235,
449
- "grad_norm": 0.955984354019165,
450
- "learning_rate": 3.424458909881897e-06,
451
- "loss": 2.0773,
452
- "step": 305
453
- },
454
- {
455
- "epoch": 0.5262041162741353,
456
- "grad_norm": 0.8287053108215332,
457
- "learning_rate": 3.273174727893463e-06,
458
- "loss": 2.0267,
459
- "step": 310
460
- },
461
- {
462
- "epoch": 0.5346912794398472,
463
- "grad_norm": 1.228607416152954,
464
- "learning_rate": 3.12366497560313e-06,
465
- "loss": 2.1409,
466
- "step": 315
467
- },
468
- {
469
- "epoch": 0.5431784426055591,
470
- "grad_norm": 0.823341965675354,
471
- "learning_rate": 2.976083284388031e-06,
472
- "loss": 2.4237,
473
- "step": 320
474
- },
475
- {
476
- "epoch": 0.551665605771271,
477
- "grad_norm": 1.032359004020691,
478
- "learning_rate": 2.83058130441221e-06,
479
- "loss": 2.2527,
480
- "step": 325
481
- },
482
- {
483
- "epoch": 0.5601527689369828,
484
- "grad_norm": 1.1261106729507446,
485
- "learning_rate": 2.687308548795825e-06,
486
- "loss": 2.0518,
487
- "step": 330
488
- },
489
- {
490
- "epoch": 0.5686399321026947,
491
- "grad_norm": 1.1350051164627075,
492
- "learning_rate": 2.5464122399803126e-06,
493
- "loss": 2.02,
494
- "step": 335
495
- },
496
- {
497
- "epoch": 0.5771270952684066,
498
- "grad_norm": 0.5946178436279297,
499
- "learning_rate": 2.408037158447375e-06,
500
- "loss": 2.3448,
501
- "step": 340
502
- },
503
- {
504
- "epoch": 0.5856142584341184,
505
- "grad_norm": 0.8627076148986816,
506
- "learning_rate": 2.272325493947257e-06,
507
- "loss": 2.1151,
508
- "step": 345
509
- },
510
- {
511
- "epoch": 0.5941014215998303,
512
- "grad_norm": 0.7735455632209778,
513
- "learning_rate": 2.139416699389153e-06,
514
- "loss": 2.2088,
515
- "step": 350
516
- },
517
- {
518
- "epoch": 0.6025885847655421,
519
- "grad_norm": 1.1364538669586182,
520
- "learning_rate": 2.00944734754392e-06,
521
- "loss": 2.2205,
522
- "step": 355
523
- },
524
- {
525
- "epoch": 0.611075747931254,
526
- "grad_norm": 0.9770228862762451,
527
- "learning_rate": 1.8825509907063328e-06,
528
- "loss": 2.0885,
529
- "step": 360
530
- },
531
- {
532
- "epoch": 0.6195629110969658,
533
- "grad_norm": 1.0555680990219116,
534
- "learning_rate": 1.7588580234610592e-06,
535
- "loss": 1.9825,
536
- "step": 365
537
- },
538
- {
539
- "epoch": 0.6280500742626777,
540
- "grad_norm": 1.0218920707702637,
541
- "learning_rate": 1.6384955486934157e-06,
542
- "loss": 2.1847,
543
- "step": 370
544
- },
545
- {
546
- "epoch": 0.6365372374283895,
547
- "grad_norm": 1.1606744527816772,
548
- "learning_rate": 1.5215872469825682e-06,
549
- "loss": 2.0574,
550
- "step": 375
551
  }
552
  ],
553
  "logging_steps": 5,
554
  "max_steps": 500,
555
  "num_input_tokens_seen": 0,
556
  "num_train_epochs": 1,
557
- "save_steps": 125,
558
  "stateful_callbacks": {
559
  "TrainerControl": {
560
  "args": {
@@ -567,7 +241,7 @@
567
  "attributes": {}
568
  }
569
  },
570
- "total_flos": 1.190802534432768e+16,
571
  "train_batch_size": 1,
572
  "trial_name": null,
573
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.21727137704222363,
5
+ "eval_steps": 32,
6
+ "global_step": 128,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
11
  {
12
  "epoch": 0.001697432633142372,
13
  "eval_loss": 2.4177019596099854,
14
+ "eval_runtime": 13.7299,
15
+ "eval_samples_per_second": 18.136,
16
+ "eval_steps_per_second": 18.136,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.008487163165711862,
21
+ "grad_norm": 0.5580189228057861,
22
  "learning_rate": 5e-06,
23
+ "loss": 2.0835,
24
  "step": 5
25
  },
26
  {
27
  "epoch": 0.016974326331423723,
28
+ "grad_norm": 0.5617932081222534,
29
  "learning_rate": 1e-05,
30
  "loss": 2.3638,
31
  "step": 10
32
  },
33
  {
34
  "epoch": 0.025461489497135583,
35
+ "grad_norm": 0.637174129486084,
36
  "learning_rate": 9.99743108100344e-06,
37
+ "loss": 2.3443,
38
  "step": 15
39
  },
40
  {
41
  "epoch": 0.033948652662847446,
42
+ "grad_norm": 0.7906777858734131,
43
  "learning_rate": 9.989726963751683e-06,
44
+ "loss": 2.4875,
45
  "step": 20
46
  },
47
  {
48
  "epoch": 0.042435815828559306,
49
+ "grad_norm": 0.7220119833946228,
50
  "learning_rate": 9.976895564745993e-06,
51
+ "loss": 2.2905,
52
  "step": 25
53
  },
54
  {
55
  "epoch": 0.050922978994271166,
56
+ "grad_norm": 0.4569860100746155,
57
  "learning_rate": 9.95895006911623e-06,
58
+ "loss": 2.8207,
59
  "step": 30
60
  },
61
+ {
62
+ "epoch": 0.05431784426055591,
63
+ "eval_loss": 2.396263599395752,
64
+ "eval_runtime": 13.7534,
65
+ "eval_samples_per_second": 18.105,
66
+ "eval_steps_per_second": 18.105,
67
+ "step": 32
68
+ },
69
  {
70
  "epoch": 0.059410142159983026,
71
+ "grad_norm": 0.5562223196029663,
72
  "learning_rate": 9.935908917072253e-06,
73
+ "loss": 2.3774,
74
  "step": 35
75
  },
76
  {
77
  "epoch": 0.06789730532569489,
78
+ "grad_norm": 0.8851813077926636,
79
  "learning_rate": 9.907795784955327e-06,
80
+ "loss": 2.3059,
81
  "step": 40
82
  },
83
  {
84
  "epoch": 0.07638446849140675,
85
+ "grad_norm": 0.8263425827026367,
86
  "learning_rate": 9.874639560909118e-06,
87
+ "loss": 2.2858,
88
  "step": 45
89
  },
90
  {
91
  "epoch": 0.08487163165711861,
92
+ "grad_norm": 0.9496198296546936,
93
  "learning_rate": 9.836474315195148e-06,
94
+ "loss": 2.2606,
95
  "step": 50
96
  },
97
  {
98
  "epoch": 0.09335879482283047,
99
+ "grad_norm": 0.8389888405799866,
100
  "learning_rate": 9.793339265183303e-06,
101
+ "loss": 2.4757,
102
  "step": 55
103
  },
104
  {
105
  "epoch": 0.10184595798854233,
106
+ "grad_norm": 0.9090803861618042,
107
  "learning_rate": 9.745278735053345e-06,
108
+ "loss": 2.2428,
109
  "step": 60
110
  },
111
+ {
112
+ "epoch": 0.10863568852111181,
113
+ "eval_loss": 2.3315982818603516,
114
+ "eval_runtime": 13.995,
115
+ "eval_samples_per_second": 17.792,
116
+ "eval_steps_per_second": 17.792,
117
+ "step": 64
118
+ },
119
  {
120
  "epoch": 0.11033312115425419,
121
+ "grad_norm": 0.8944710493087769,
122
  "learning_rate": 9.692342110248802e-06,
123
+ "loss": 2.361,
124
  "step": 65
125
  },
126
  {
127
  "epoch": 0.11882028431996605,
128
+ "grad_norm": 0.8705277442932129,
129
  "learning_rate": 9.63458378673011e-06,
130
+ "loss": 2.2061,
131
  "step": 70
132
  },
133
  {
134
  "epoch": 0.1273074474856779,
135
+ "grad_norm": 1.0183981657028198,
136
  "learning_rate": 9.572063115079063e-06,
137
+ "loss": 2.3014,
138
  "step": 75
139
  },
140
  {
141
  "epoch": 0.13579461065138979,
142
+ "grad_norm": 0.9694010615348816,
143
  "learning_rate": 9.504844339512096e-06,
144
+ "loss": 2.4273,
145
  "step": 80
146
  },
147
  {
148
  "epoch": 0.14428177381710164,
149
+ "grad_norm": 0.6600094437599182,
150
  "learning_rate": 9.432996531865001e-06,
151
+ "loss": 2.2039,
152
  "step": 85
153
  },
154
  {
155
  "epoch": 0.1527689369828135,
156
+ "grad_norm": 1.437016487121582,
157
  "learning_rate": 9.356593520616948e-06,
158
+ "loss": 2.4129,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.16125610014852534,
163
+ "grad_norm": 1.1358604431152344,
164
  "learning_rate": 9.275713815026732e-06,
165
+ "loss": 2.2346,
166
  "step": 95
167
  },
168
+ {
169
+ "epoch": 0.16295353278166771,
170
+ "eval_loss": 2.2801592350006104,
171
+ "eval_runtime": 14.3531,
172
+ "eval_samples_per_second": 17.348,
173
+ "eval_steps_per_second": 17.348,
174
+ "step": 96
175
+ },
176
  {
177
  "epoch": 0.16974326331423722,
178
+ "grad_norm": 0.8347494006156921,
179
  "learning_rate": 9.190440524459203e-06,
180
+ "loss": 2.5003,
181
  "step": 100
182
  },
183
  {
184
  "epoch": 0.17823042647994908,
185
+ "grad_norm": 0.9528422355651855,
186
  "learning_rate": 9.10086127298478e-06,
187
+ "loss": 2.2398,
188
  "step": 105
189
  },
190
  {
191
  "epoch": 0.18671758964566093,
192
+ "grad_norm": 0.7451781630516052,
193
  "learning_rate": 9.007068109339783e-06,
194
+ "loss": 2.253,
195
  "step": 110
196
  },
197
  {
198
  "epoch": 0.1952047528113728,
199
+ "grad_norm": 0.6891763210296631,
200
  "learning_rate": 8.90915741234015e-06,
201
+ "loss": 2.0703,
202
  "step": 115
203
  },
204
  {
205
  "epoch": 0.20369191597708466,
206
+ "grad_norm": 0.7363041639328003,
207
  "learning_rate": 8.807229791845673e-06,
208
+ "loss": 2.3083,
209
  "step": 120
210
  },
211
  {
212
  "epoch": 0.21217907914279652,
213
+ "grad_norm": 0.7747501730918884,
214
  "learning_rate": 8.701389985376578e-06,
215
+ "loss": 2.2058,
216
  "step": 125
217
  },
218
  {
219
+ "epoch": 0.21727137704222363,
220
+ "eval_loss": 2.245945692062378,
221
+ "eval_runtime": 14.3131,
222
+ "eval_samples_per_second": 17.397,
223
+ "eval_steps_per_second": 17.397,
224
+ "step": 128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  }
226
  ],
227
  "logging_steps": 5,
228
  "max_steps": 500,
229
  "num_input_tokens_seen": 0,
230
  "num_train_epochs": 1,
231
+ "save_steps": 16,
232
  "stateful_callbacks": {
233
  "TrainerControl": {
234
  "args": {
 
241
  "attributes": {}
242
  }
243
  },
244
+ "total_flos": 4073485919846400.0,
245
  "train_batch_size": 1,
246
  "trial_name": null,
247
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38f78d4323ef233534399aab0b37c291dc3254cb806a9a78ab7e35d1f4316915
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e902904bbfeafcf7200b04696313449326cdab359ba6c8339db9eada6e4a62e
3
  size 6776