cwaud commited on
Commit
fbd3bf3
·
verified ·
1 Parent(s): d7f431b

Training in progress, step 50, checkpoint

Browse files
last-checkpoint/README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- base_model: unsloth/Llama-3.2-1B-Instruct
3
  library_name: peft
4
  ---
5
 
 
1
  ---
2
+ base_model: unsloth/Llama-3.2-3B-Instruct
3
  library_name: peft
4
  ---
5
 
last-checkpoint/adapter_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
- "base_model_name_or_path": "unsloth/Llama-3.2-1B-Instruct",
5
  "bias": "none",
6
  "fan_in_fan_out": null,
7
  "inference_mode": true,
@@ -20,12 +20,12 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "q_proj",
24
- "v_proj",
25
- "gate_proj",
26
  "k_proj",
 
27
  "up_proj",
28
  "o_proj",
 
 
29
  "down_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
 
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct",
5
  "bias": "none",
6
  "fan_in_fan_out": null,
7
  "inference_mode": true,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
 
 
23
  "k_proj",
24
+ "gate_proj",
25
  "up_proj",
26
  "o_proj",
27
+ "v_proj",
28
+ "q_proj",
29
  "down_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c3f769ff1e4ba4c98c03a624ed01a33e854004d0f07165933ad20c79838391d
3
- size 45118424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef850e00f6ddccf7a92baced38593af32bacbc7deb92d8d8407aa4aa0408de1a
3
+ size 97307544
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c89e8e9eeced7aa8ba2c881e43808e892dc2bfda03265131865bd0d70d2f6ac2
3
- size 90365754
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1bc63593d20de9621903470fe84da0c3845eac9f62fcf47fa7fc53c31b55e0a
3
+ size 194840426
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:15bd35e71399de8fb4118025c15f8deb2ca06276b3bf83802724255e61131eb7
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9985d87c9fe1230092403fc0c5f05d287ced3986c4900ee14ddc69775bac2f24
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:340e82764c0efd5e9e8cadc3d39d5cf7050cd6012718448008af049ab8a6b827
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a3057cc3055b088ac745445a3a938968f2d198c20a2487c1a93787006959980
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,1122 +1,390 @@
1
  {
2
- "best_metric": 2.570774555206299,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-150",
4
- "epoch": 0.037460198539052254,
5
  "eval_steps": 25,
6
- "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.00024973465692701506,
13
- "grad_norm": 1.0262023210525513,
14
- "learning_rate": 2.8571428571428573e-06,
15
- "loss": 3.2239,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.00024973465692701506,
20
- "eval_loss": 3.366023302078247,
21
- "eval_runtime": 2.886,
22
- "eval_samples_per_second": 17.325,
23
- "eval_steps_per_second": 2.772,
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.0004994693138540301,
28
- "grad_norm": 0.8814900517463684,
29
- "learning_rate": 5.7142857142857145e-06,
30
- "loss": 3.2252,
31
  "step": 2
32
  },
33
  {
34
- "epoch": 0.0007492039707810451,
35
- "grad_norm": 1.10072660446167,
36
- "learning_rate": 8.571428571428573e-06,
37
- "loss": 3.339,
38
  "step": 3
39
  },
40
  {
41
- "epoch": 0.0009989386277080602,
42
- "grad_norm": 1.023310661315918,
43
- "learning_rate": 1.1428571428571429e-05,
44
- "loss": 3.2591,
45
  "step": 4
46
  },
47
  {
48
- "epoch": 0.0012486732846350753,
49
- "grad_norm": 1.0511233806610107,
50
- "learning_rate": 1.4285714285714285e-05,
51
- "loss": 3.3003,
52
  "step": 5
53
  },
54
  {
55
- "epoch": 0.0014984079415620902,
56
- "grad_norm": 0.9858811497688293,
57
- "learning_rate": 1.7142857142857145e-05,
58
- "loss": 3.2781,
59
  "step": 6
60
  },
61
  {
62
- "epoch": 0.0017481425984891053,
63
- "grad_norm": 1.1659033298492432,
64
- "learning_rate": 2e-05,
65
- "loss": 3.2301,
66
  "step": 7
67
  },
68
  {
69
- "epoch": 0.0019978772554161204,
70
- "grad_norm": 1.0043777227401733,
71
- "learning_rate": 2.2857142857142858e-05,
72
- "loss": 3.1818,
73
  "step": 8
74
  },
75
  {
76
- "epoch": 0.0022476119123431356,
77
- "grad_norm": 1.0616182088851929,
78
- "learning_rate": 2.5714285714285714e-05,
79
- "loss": 3.2333,
80
  "step": 9
81
  },
82
  {
83
- "epoch": 0.0024973465692701507,
84
- "grad_norm": 1.107161521911621,
85
- "learning_rate": 2.857142857142857e-05,
86
- "loss": 3.3466,
87
  "step": 10
88
  },
89
  {
90
- "epoch": 0.0027470812261971653,
91
- "grad_norm": 1.209277868270874,
92
- "learning_rate": 3.142857142857143e-05,
93
- "loss": 3.2744,
94
  "step": 11
95
  },
96
  {
97
- "epoch": 0.0029968158831241805,
98
- "grad_norm": 1.04523766040802,
99
- "learning_rate": 3.428571428571429e-05,
100
- "loss": 3.1888,
101
  "step": 12
102
  },
103
  {
104
- "epoch": 0.0032465505400511956,
105
- "grad_norm": 1.0623782873153687,
106
- "learning_rate": 3.7142857142857143e-05,
107
- "loss": 3.1969,
108
  "step": 13
109
  },
110
  {
111
- "epoch": 0.0034962851969782107,
112
- "grad_norm": 1.007890224456787,
113
- "learning_rate": 4e-05,
114
- "loss": 3.2903,
115
  "step": 14
116
  },
117
  {
118
- "epoch": 0.0037460198539052258,
119
- "grad_norm": 0.8465790152549744,
120
- "learning_rate": 4.2857142857142856e-05,
121
- "loss": 3.0649,
122
  "step": 15
123
  },
124
  {
125
- "epoch": 0.003995754510832241,
126
- "grad_norm": 0.8478276133537292,
127
- "learning_rate": 4.5714285714285716e-05,
128
- "loss": 3.1391,
129
  "step": 16
130
  },
131
  {
132
- "epoch": 0.004245489167759256,
133
- "grad_norm": 0.7651631236076355,
134
- "learning_rate": 4.8571428571428576e-05,
135
- "loss": 3.1899,
136
  "step": 17
137
  },
138
  {
139
- "epoch": 0.004495223824686271,
140
- "grad_norm": 0.7943688631057739,
141
- "learning_rate": 5.142857142857143e-05,
142
- "loss": 3.051,
143
  "step": 18
144
  },
145
  {
146
- "epoch": 0.004744958481613286,
147
- "grad_norm": 0.7817432284355164,
148
- "learning_rate": 5.428571428571428e-05,
149
- "loss": 3.0732,
150
  "step": 19
151
  },
152
  {
153
- "epoch": 0.004994693138540301,
154
- "grad_norm": 0.9154627919197083,
155
- "learning_rate": 5.714285714285714e-05,
156
- "loss": 3.1392,
157
  "step": 20
158
  },
159
  {
160
- "epoch": 0.005244427795467316,
161
- "grad_norm": 0.9029829502105713,
162
- "learning_rate": 6e-05,
163
- "loss": 3.21,
164
  "step": 21
165
  },
166
  {
167
- "epoch": 0.005494162452394331,
168
- "grad_norm": 0.8615948557853699,
169
- "learning_rate": 6.285714285714286e-05,
170
- "loss": 2.9363,
171
  "step": 22
172
  },
173
  {
174
- "epoch": 0.005743897109321346,
175
- "grad_norm": 0.8935572504997253,
176
- "learning_rate": 6.571428571428571e-05,
177
- "loss": 2.8907,
178
  "step": 23
179
  },
180
  {
181
- "epoch": 0.005993631766248361,
182
- "grad_norm": 0.7808010578155518,
183
- "learning_rate": 6.857142857142858e-05,
184
- "loss": 3.0218,
185
  "step": 24
186
  },
187
  {
188
- "epoch": 0.006243366423175376,
189
- "grad_norm": 0.7767881751060486,
190
- "learning_rate": 7.142857142857143e-05,
191
- "loss": 2.981,
192
  "step": 25
193
  },
194
  {
195
- "epoch": 0.006243366423175376,
196
- "eval_loss": 3.0071640014648438,
197
- "eval_runtime": 2.6827,
198
- "eval_samples_per_second": 18.638,
199
- "eval_steps_per_second": 2.982,
200
  "step": 25
201
  },
202
  {
203
- "epoch": 0.006493101080102391,
204
- "grad_norm": 0.9288599491119385,
205
- "learning_rate": 7.428571428571429e-05,
206
- "loss": 2.9497,
207
  "step": 26
208
  },
209
  {
210
- "epoch": 0.006742835737029406,
211
- "grad_norm": 0.7947930097579956,
212
- "learning_rate": 7.714285714285715e-05,
213
- "loss": 2.8892,
214
  "step": 27
215
  },
216
  {
217
- "epoch": 0.006992570393956421,
218
- "grad_norm": 0.9469573497772217,
219
- "learning_rate": 8e-05,
220
- "loss": 3.2229,
221
  "step": 28
222
  },
223
  {
224
- "epoch": 0.0072423050508834364,
225
- "grad_norm": 0.820366382598877,
226
- "learning_rate": 8.285714285714287e-05,
227
- "loss": 2.8336,
228
  "step": 29
229
  },
230
  {
231
- "epoch": 0.0074920397078104516,
232
- "grad_norm": 0.7744409441947937,
233
- "learning_rate": 8.571428571428571e-05,
234
- "loss": 2.7295,
235
  "step": 30
236
  },
237
  {
238
- "epoch": 0.007741774364737467,
239
- "grad_norm": 0.890326201915741,
240
- "learning_rate": 8.857142857142857e-05,
241
- "loss": 2.8798,
242
  "step": 31
243
  },
244
  {
245
- "epoch": 0.007991509021664482,
246
- "grad_norm": 0.9005088210105896,
247
- "learning_rate": 9.142857142857143e-05,
248
- "loss": 2.9725,
249
  "step": 32
250
  },
251
  {
252
- "epoch": 0.008241243678591496,
253
- "grad_norm": 0.9345659017562866,
254
- "learning_rate": 9.428571428571429e-05,
255
- "loss": 2.9779,
256
  "step": 33
257
  },
258
  {
259
- "epoch": 0.008490978335518512,
260
- "grad_norm": 0.8667367100715637,
261
- "learning_rate": 9.714285714285715e-05,
262
- "loss": 2.7893,
263
  "step": 34
264
  },
265
  {
266
- "epoch": 0.008740712992445526,
267
- "grad_norm": 0.8754308819770813,
268
- "learning_rate": 0.0001,
269
- "loss": 2.9642,
270
  "step": 35
271
  },
272
  {
273
- "epoch": 0.008990447649372542,
274
- "grad_norm": 0.9180012941360474,
275
- "learning_rate": 0.00010285714285714286,
276
- "loss": 2.7238,
277
  "step": 36
278
  },
279
  {
280
- "epoch": 0.009240182306299556,
281
- "grad_norm": 0.8926249146461487,
282
- "learning_rate": 0.00010571428571428572,
283
- "loss": 2.7661,
284
  "step": 37
285
  },
286
  {
287
- "epoch": 0.009489916963226572,
288
- "grad_norm": 0.8704882264137268,
289
- "learning_rate": 0.00010857142857142856,
290
- "loss": 2.9965,
291
  "step": 38
292
  },
293
  {
294
- "epoch": 0.009739651620153587,
295
- "grad_norm": 0.8835905194282532,
296
- "learning_rate": 0.00011142857142857144,
297
- "loss": 2.7978,
298
  "step": 39
299
  },
300
  {
301
- "epoch": 0.009989386277080603,
302
- "grad_norm": 0.8821542263031006,
303
- "learning_rate": 0.00011428571428571428,
304
- "loss": 2.8393,
305
  "step": 40
306
  },
307
  {
308
- "epoch": 0.010239120934007617,
309
- "grad_norm": 1.0209946632385254,
310
- "learning_rate": 0.00011714285714285715,
311
- "loss": 2.9775,
312
  "step": 41
313
  },
314
  {
315
- "epoch": 0.010488855590934631,
316
- "grad_norm": 0.8903468251228333,
317
- "learning_rate": 0.00012,
318
- "loss": 2.8332,
319
  "step": 42
320
  },
321
  {
322
- "epoch": 0.010738590247861647,
323
- "grad_norm": 1.0161865949630737,
324
- "learning_rate": 0.00012285714285714287,
325
- "loss": 2.9317,
326
  "step": 43
327
  },
328
  {
329
- "epoch": 0.010988324904788661,
330
- "grad_norm": 0.9254084229469299,
331
- "learning_rate": 0.00012571428571428572,
332
- "loss": 2.8001,
333
  "step": 44
334
  },
335
  {
336
- "epoch": 0.011238059561715677,
337
- "grad_norm": 0.8981170654296875,
338
- "learning_rate": 0.00012857142857142858,
339
- "loss": 3.0302,
340
  "step": 45
341
  },
342
  {
343
- "epoch": 0.011487794218642692,
344
- "grad_norm": 0.9837843775749207,
345
- "learning_rate": 0.00013142857142857143,
346
- "loss": 2.6885,
347
  "step": 46
348
  },
349
  {
350
- "epoch": 0.011737528875569708,
351
- "grad_norm": 0.9914678335189819,
352
- "learning_rate": 0.00013428571428571428,
353
- "loss": 2.9183,
354
  "step": 47
355
  },
356
  {
357
- "epoch": 0.011987263532496722,
358
- "grad_norm": 1.0084929466247559,
359
- "learning_rate": 0.00013714285714285716,
360
- "loss": 2.8724,
361
  "step": 48
362
  },
363
  {
364
- "epoch": 0.012236998189423738,
365
- "grad_norm": 1.538197636604309,
366
- "learning_rate": 0.00014,
367
- "loss": 2.7356,
368
  "step": 49
369
  },
370
  {
371
- "epoch": 0.012486732846350752,
372
- "grad_norm": 2.431025743484497,
373
- "learning_rate": 0.00014285714285714287,
374
- "loss": 2.8119,
375
  "step": 50
376
  },
377
  {
378
- "epoch": 0.012486732846350752,
379
- "eval_loss": 2.811203956604004,
380
- "eval_runtime": 2.6823,
381
- "eval_samples_per_second": 18.641,
382
- "eval_steps_per_second": 2.983,
383
  "step": 50
384
- },
385
- {
386
- "epoch": 0.012736467503277768,
387
- "grad_norm": 1.0722112655639648,
388
- "learning_rate": 0.00014571428571428572,
389
- "loss": 3.1445,
390
- "step": 51
391
- },
392
- {
393
- "epoch": 0.012986202160204782,
394
- "grad_norm": 1.0569753646850586,
395
- "learning_rate": 0.00014857142857142857,
396
- "loss": 2.9575,
397
- "step": 52
398
- },
399
- {
400
- "epoch": 0.013235936817131798,
401
- "grad_norm": 0.8477814197540283,
402
- "learning_rate": 0.00015142857142857143,
403
- "loss": 2.9096,
404
- "step": 53
405
- },
406
- {
407
- "epoch": 0.013485671474058812,
408
- "grad_norm": 0.7475413084030151,
409
- "learning_rate": 0.0001542857142857143,
410
- "loss": 2.7139,
411
- "step": 54
412
- },
413
- {
414
- "epoch": 0.013735406130985827,
415
- "grad_norm": 0.6939762234687805,
416
- "learning_rate": 0.00015714285714285716,
417
- "loss": 2.9446,
418
- "step": 55
419
- },
420
- {
421
- "epoch": 0.013985140787912843,
422
- "grad_norm": 0.772847056388855,
423
- "learning_rate": 0.00016,
424
- "loss": 2.7692,
425
- "step": 56
426
- },
427
- {
428
- "epoch": 0.014234875444839857,
429
- "grad_norm": 0.7479884028434753,
430
- "learning_rate": 0.00016285714285714287,
431
- "loss": 2.7092,
432
- "step": 57
433
- },
434
- {
435
- "epoch": 0.014484610101766873,
436
- "grad_norm": 0.7816758155822754,
437
- "learning_rate": 0.00016571428571428575,
438
- "loss": 2.8473,
439
- "step": 58
440
- },
441
- {
442
- "epoch": 0.014734344758693887,
443
- "grad_norm": 0.7092522382736206,
444
- "learning_rate": 0.00016857142857142857,
445
- "loss": 2.706,
446
- "step": 59
447
- },
448
- {
449
- "epoch": 0.014984079415620903,
450
- "grad_norm": 0.6997970938682556,
451
- "learning_rate": 0.00017142857142857143,
452
- "loss": 2.8594,
453
- "step": 60
454
- },
455
- {
456
- "epoch": 0.015233814072547917,
457
- "grad_norm": 0.654046356678009,
458
- "learning_rate": 0.0001742857142857143,
459
- "loss": 2.7968,
460
- "step": 61
461
- },
462
- {
463
- "epoch": 0.015483548729474933,
464
- "grad_norm": 0.6976904273033142,
465
- "learning_rate": 0.00017714285714285713,
466
- "loss": 2.8346,
467
- "step": 62
468
- },
469
- {
470
- "epoch": 0.01573328338640195,
471
- "grad_norm": 0.7366370558738708,
472
- "learning_rate": 0.00018,
473
- "loss": 2.8191,
474
- "step": 63
475
- },
476
- {
477
- "epoch": 0.015983018043328964,
478
- "grad_norm": 0.7815442085266113,
479
- "learning_rate": 0.00018285714285714286,
480
- "loss": 2.8074,
481
- "step": 64
482
- },
483
- {
484
- "epoch": 0.016232752700255978,
485
- "grad_norm": 0.7780041694641113,
486
- "learning_rate": 0.00018571428571428572,
487
- "loss": 2.7443,
488
- "step": 65
489
- },
490
- {
491
- "epoch": 0.016482487357182992,
492
- "grad_norm": 0.6779807806015015,
493
- "learning_rate": 0.00018857142857142857,
494
- "loss": 2.6378,
495
- "step": 66
496
- },
497
- {
498
- "epoch": 0.01673222201411001,
499
- "grad_norm": 0.6851223111152649,
500
- "learning_rate": 0.00019142857142857145,
501
- "loss": 2.4737,
502
- "step": 67
503
- },
504
- {
505
- "epoch": 0.016981956671037024,
506
- "grad_norm": 0.741107702255249,
507
- "learning_rate": 0.0001942857142857143,
508
- "loss": 2.708,
509
- "step": 68
510
- },
511
- {
512
- "epoch": 0.017231691327964038,
513
- "grad_norm": 0.7790065407752991,
514
- "learning_rate": 0.00019714285714285716,
515
- "loss": 2.7973,
516
- "step": 69
517
- },
518
- {
519
- "epoch": 0.017481425984891052,
520
- "grad_norm": 0.8207802176475525,
521
- "learning_rate": 0.0002,
522
- "loss": 2.7328,
523
- "step": 70
524
- },
525
- {
526
- "epoch": 0.017731160641818067,
527
- "grad_norm": 0.8045083284378052,
528
- "learning_rate": 0.00019999984264028182,
529
- "loss": 2.6503,
530
- "step": 71
531
- },
532
- {
533
- "epoch": 0.017980895298745084,
534
- "grad_norm": 0.8912211060523987,
535
- "learning_rate": 0.0001999993705616775,
536
- "loss": 2.6638,
537
- "step": 72
538
- },
539
- {
540
- "epoch": 0.0182306299556721,
541
- "grad_norm": 0.8193833231925964,
542
- "learning_rate": 0.0001999985837658379,
543
- "loss": 2.3789,
544
- "step": 73
545
- },
546
- {
547
- "epoch": 0.018480364612599113,
548
- "grad_norm": 0.8749593496322632,
549
- "learning_rate": 0.0001999974822555143,
550
- "loss": 2.6006,
551
- "step": 74
552
- },
553
- {
554
- "epoch": 0.018730099269526127,
555
- "grad_norm": 0.911247730255127,
556
- "learning_rate": 0.00019999606603455857,
557
- "loss": 2.6721,
558
- "step": 75
559
- },
560
- {
561
- "epoch": 0.018730099269526127,
562
- "eval_loss": 2.6788198947906494,
563
- "eval_runtime": 2.6857,
564
- "eval_samples_per_second": 18.617,
565
- "eval_steps_per_second": 2.979,
566
- "step": 75
567
- },
568
- {
569
- "epoch": 0.018979833926453145,
570
- "grad_norm": 0.9761448502540588,
571
- "learning_rate": 0.00019999433510792307,
572
- "loss": 2.7212,
573
- "step": 76
574
- },
575
- {
576
- "epoch": 0.01922956858338016,
577
- "grad_norm": 0.7784520983695984,
578
- "learning_rate": 0.00019999228948166064,
579
- "loss": 2.7151,
580
- "step": 77
581
- },
582
- {
583
- "epoch": 0.019479303240307173,
584
- "grad_norm": 0.8097946047782898,
585
- "learning_rate": 0.00019998992916292463,
586
- "loss": 2.4256,
587
- "step": 78
588
- },
589
- {
590
- "epoch": 0.019729037897234188,
591
- "grad_norm": 0.8229548931121826,
592
- "learning_rate": 0.00019998725415996875,
593
- "loss": 2.482,
594
- "step": 79
595
- },
596
- {
597
- "epoch": 0.019978772554161205,
598
- "grad_norm": 0.8396543860435486,
599
- "learning_rate": 0.00019998426448214718,
600
- "loss": 2.5676,
601
- "step": 80
602
- },
603
- {
604
- "epoch": 0.02022850721108822,
605
- "grad_norm": 0.84672611951828,
606
- "learning_rate": 0.0001999809601399145,
607
- "loss": 2.4699,
608
- "step": 81
609
- },
610
- {
611
- "epoch": 0.020478241868015234,
612
- "grad_norm": 0.8323167562484741,
613
- "learning_rate": 0.0001999773411448256,
614
- "loss": 2.6894,
615
- "step": 82
616
- },
617
- {
618
- "epoch": 0.020727976524942248,
619
- "grad_norm": 0.9671497344970703,
620
- "learning_rate": 0.00019997340750953566,
621
- "loss": 2.6653,
622
- "step": 83
623
- },
624
- {
625
- "epoch": 0.020977711181869262,
626
- "grad_norm": 1.0757114887237549,
627
- "learning_rate": 0.00019996915924780015,
628
- "loss": 2.6165,
629
- "step": 84
630
- },
631
- {
632
- "epoch": 0.02122744583879628,
633
- "grad_norm": 0.8748140335083008,
634
- "learning_rate": 0.00019996459637447477,
635
- "loss": 2.582,
636
- "step": 85
637
- },
638
- {
639
- "epoch": 0.021477180495723294,
640
- "grad_norm": 0.8593396544456482,
641
- "learning_rate": 0.0001999597189055153,
642
- "loss": 2.4936,
643
- "step": 86
644
- },
645
- {
646
- "epoch": 0.02172691515265031,
647
- "grad_norm": 0.933760941028595,
648
- "learning_rate": 0.00019995452685797773,
649
- "loss": 2.8435,
650
- "step": 87
651
- },
652
- {
653
- "epoch": 0.021976649809577323,
654
- "grad_norm": 0.8785303235054016,
655
- "learning_rate": 0.00019994902025001802,
656
- "loss": 2.4451,
657
- "step": 88
658
- },
659
- {
660
- "epoch": 0.02222638446650434,
661
- "grad_norm": 0.8894078731536865,
662
- "learning_rate": 0.0001999431991008921,
663
- "loss": 2.5532,
664
- "step": 89
665
- },
666
- {
667
- "epoch": 0.022476119123431355,
668
- "grad_norm": 0.9705137610435486,
669
- "learning_rate": 0.00019993706343095588,
670
- "loss": 2.7539,
671
- "step": 90
672
- },
673
- {
674
- "epoch": 0.02272585378035837,
675
- "grad_norm": 1.0564568042755127,
676
- "learning_rate": 0.0001999306132616651,
677
- "loss": 2.7694,
678
- "step": 91
679
- },
680
- {
681
- "epoch": 0.022975588437285383,
682
- "grad_norm": 1.0100560188293457,
683
- "learning_rate": 0.00019992384861557515,
684
- "loss": 2.6056,
685
- "step": 92
686
- },
687
- {
688
- "epoch": 0.0232253230942124,
689
- "grad_norm": 1.0753816366195679,
690
- "learning_rate": 0.00019991676951634132,
691
- "loss": 2.4921,
692
- "step": 93
693
- },
694
- {
695
- "epoch": 0.023475057751139415,
696
- "grad_norm": 0.9264618158340454,
697
- "learning_rate": 0.00019990937598871834,
698
- "loss": 2.6055,
699
- "step": 94
700
- },
701
- {
702
- "epoch": 0.02372479240806643,
703
- "grad_norm": 1.0126386880874634,
704
- "learning_rate": 0.00019990166805856048,
705
- "loss": 2.4358,
706
- "step": 95
707
- },
708
- {
709
- "epoch": 0.023974527064993444,
710
- "grad_norm": 0.9737821817398071,
711
- "learning_rate": 0.0001998936457528215,
712
- "loss": 2.7028,
713
- "step": 96
714
- },
715
- {
716
- "epoch": 0.024224261721920458,
717
- "grad_norm": 1.0489351749420166,
718
- "learning_rate": 0.00019988530909955448,
719
- "loss": 2.6114,
720
- "step": 97
721
- },
722
- {
723
- "epoch": 0.024473996378847476,
724
- "grad_norm": 1.1334600448608398,
725
- "learning_rate": 0.00019987665812791166,
726
- "loss": 2.8804,
727
- "step": 98
728
- },
729
- {
730
- "epoch": 0.02472373103577449,
731
- "grad_norm": 1.2668412923812866,
732
- "learning_rate": 0.0001998676928681445,
733
- "loss": 2.9214,
734
- "step": 99
735
- },
736
- {
737
- "epoch": 0.024973465692701504,
738
- "grad_norm": 2.203848361968994,
739
- "learning_rate": 0.0001998584133516035,
740
- "loss": 2.9326,
741
- "step": 100
742
- },
743
- {
744
- "epoch": 0.024973465692701504,
745
- "eval_loss": 2.6431591510772705,
746
- "eval_runtime": 2.6774,
747
- "eval_samples_per_second": 18.675,
748
- "eval_steps_per_second": 2.988,
749
- "step": 100
750
- },
751
- {
752
- "epoch": 0.02522320034962852,
753
- "grad_norm": 0.9147284030914307,
754
- "learning_rate": 0.00019984881961073798,
755
- "loss": 2.6948,
756
- "step": 101
757
- },
758
- {
759
- "epoch": 0.025472935006555536,
760
- "grad_norm": 0.8852336406707764,
761
- "learning_rate": 0.00019983891167909616,
762
- "loss": 2.5529,
763
- "step": 102
764
- },
765
- {
766
- "epoch": 0.02572266966348255,
767
- "grad_norm": 0.7818748354911804,
768
- "learning_rate": 0.00019982868959132492,
769
- "loss": 2.7862,
770
- "step": 103
771
- },
772
- {
773
- "epoch": 0.025972404320409564,
774
- "grad_norm": 0.6491966247558594,
775
- "learning_rate": 0.00019981815338316968,
776
- "loss": 2.6352,
777
- "step": 104
778
- },
779
- {
780
- "epoch": 0.02622213897733658,
781
- "grad_norm": 0.6427002549171448,
782
- "learning_rate": 0.00019980730309147434,
783
- "loss": 2.7408,
784
- "step": 105
785
- },
786
- {
787
- "epoch": 0.026471873634263596,
788
- "grad_norm": 0.6891926527023315,
789
- "learning_rate": 0.00019979613875418107,
790
- "loss": 2.6471,
791
- "step": 106
792
- },
793
- {
794
- "epoch": 0.02672160829119061,
795
- "grad_norm": 0.8082403540611267,
796
- "learning_rate": 0.00019978466041033026,
797
- "loss": 2.5825,
798
- "step": 107
799
- },
800
- {
801
- "epoch": 0.026971342948117625,
802
- "grad_norm": 0.7740591764450073,
803
- "learning_rate": 0.00019977286810006034,
804
- "loss": 2.9007,
805
- "step": 108
806
- },
807
- {
808
- "epoch": 0.02722107760504464,
809
- "grad_norm": 0.7396791577339172,
810
- "learning_rate": 0.00019976076186460764,
811
- "loss": 2.6945,
812
- "step": 109
813
- },
814
- {
815
- "epoch": 0.027470812261971653,
816
- "grad_norm": 0.6720578670501709,
817
- "learning_rate": 0.00019974834174630622,
818
- "loss": 2.7139,
819
- "step": 110
820
- },
821
- {
822
- "epoch": 0.02772054691889867,
823
- "grad_norm": 0.8105450868606567,
824
- "learning_rate": 0.0001997356077885878,
825
- "loss": 2.7112,
826
- "step": 111
827
- },
828
- {
829
- "epoch": 0.027970281575825685,
830
- "grad_norm": 0.7144357562065125,
831
- "learning_rate": 0.00019972256003598153,
832
- "loss": 2.3851,
833
- "step": 112
834
- },
835
- {
836
- "epoch": 0.0282200162327527,
837
- "grad_norm": 0.7595510482788086,
838
- "learning_rate": 0.00019970919853411385,
839
- "loss": 2.6396,
840
- "step": 113
841
- },
842
- {
843
- "epoch": 0.028469750889679714,
844
- "grad_norm": 0.7179970145225525,
845
- "learning_rate": 0.0001996955233297084,
846
- "loss": 2.3639,
847
- "step": 114
848
- },
849
- {
850
- "epoch": 0.02871948554660673,
851
- "grad_norm": 0.7306939959526062,
852
- "learning_rate": 0.00019968153447058576,
853
- "loss": 2.5145,
854
- "step": 115
855
- },
856
- {
857
- "epoch": 0.028969220203533746,
858
- "grad_norm": 0.7519063353538513,
859
- "learning_rate": 0.00019966723200566327,
860
- "loss": 2.4479,
861
- "step": 116
862
- },
863
- {
864
- "epoch": 0.02921895486046076,
865
- "grad_norm": 0.8461882472038269,
866
- "learning_rate": 0.00019965261598495502,
867
- "loss": 2.5828,
868
- "step": 117
869
- },
870
- {
871
- "epoch": 0.029468689517387774,
872
- "grad_norm": 0.7643318772315979,
873
- "learning_rate": 0.00019963768645957152,
874
- "loss": 2.5821,
875
- "step": 118
876
- },
877
- {
878
- "epoch": 0.029718424174314792,
879
- "grad_norm": 0.7931061387062073,
880
- "learning_rate": 0.00019962244348171958,
881
- "loss": 2.5467,
882
- "step": 119
883
- },
884
- {
885
- "epoch": 0.029968158831241806,
886
- "grad_norm": 0.7353018522262573,
887
- "learning_rate": 0.00019960688710470205,
888
- "loss": 2.5064,
889
- "step": 120
890
- },
891
- {
892
- "epoch": 0.03021789348816882,
893
- "grad_norm": 0.7961385846138,
894
- "learning_rate": 0.0001995910173829178,
895
- "loss": 2.2525,
896
- "step": 121
897
- },
898
- {
899
- "epoch": 0.030467628145095835,
900
- "grad_norm": 0.8227445483207703,
901
- "learning_rate": 0.00019957483437186137,
902
- "loss": 2.3514,
903
- "step": 122
904
- },
905
- {
906
- "epoch": 0.030717362802022852,
907
- "grad_norm": 0.8792641758918762,
908
- "learning_rate": 0.0001995583381281229,
909
- "loss": 2.6704,
910
- "step": 123
911
- },
912
- {
913
- "epoch": 0.030967097458949867,
914
- "grad_norm": 0.8991889357566833,
915
- "learning_rate": 0.0001995415287093877,
916
- "loss": 2.7915,
917
- "step": 124
918
- },
919
- {
920
- "epoch": 0.03121683211587688,
921
- "grad_norm": 0.8456166982650757,
922
- "learning_rate": 0.00019952440617443647,
923
- "loss": 2.5482,
924
- "step": 125
925
- },
926
- {
927
- "epoch": 0.03121683211587688,
928
- "eval_loss": 2.5769076347351074,
929
- "eval_runtime": 2.6825,
930
- "eval_samples_per_second": 18.639,
931
- "eval_steps_per_second": 2.982,
932
- "step": 125
933
- },
934
- {
935
- "epoch": 0.0314665667728039,
936
- "grad_norm": 0.9480443000793457,
937
- "learning_rate": 0.0001995069705831446,
938
- "loss": 2.6215,
939
- "step": 126
940
- },
941
- {
942
- "epoch": 0.03171630142973091,
943
- "grad_norm": 0.8263009786605835,
944
- "learning_rate": 0.00019948922199648235,
945
- "loss": 2.5919,
946
- "step": 127
947
- },
948
- {
949
- "epoch": 0.03196603608665793,
950
- "grad_norm": 0.7829933166503906,
951
- "learning_rate": 0.00019947116047651448,
952
- "loss": 2.4247,
953
- "step": 128
954
- },
955
- {
956
- "epoch": 0.03221577074358494,
957
- "grad_norm": 0.7913976311683655,
958
- "learning_rate": 0.00019945278608639994,
959
- "loss": 2.5516,
960
- "step": 129
961
- },
962
- {
963
- "epoch": 0.032465505400511956,
964
- "grad_norm": 0.8564252257347107,
965
- "learning_rate": 0.00019943409889039188,
966
- "loss": 2.3555,
967
- "step": 130
968
- },
969
- {
970
- "epoch": 0.03271524005743897,
971
- "grad_norm": 0.809696614742279,
972
- "learning_rate": 0.0001994150989538371,
973
- "loss": 2.4586,
974
- "step": 131
975
- },
976
- {
977
- "epoch": 0.032964974714365984,
978
- "grad_norm": 0.8882852792739868,
979
- "learning_rate": 0.0001993957863431763,
980
- "loss": 2.459,
981
- "step": 132
982
- },
983
- {
984
- "epoch": 0.033214709371293,
985
- "grad_norm": 0.9187607765197754,
986
- "learning_rate": 0.0001993761611259434,
987
- "loss": 2.6238,
988
- "step": 133
989
- },
990
- {
991
- "epoch": 0.03346444402822002,
992
- "grad_norm": 0.8577990531921387,
993
- "learning_rate": 0.00019935622337076536,
994
- "loss": 2.5431,
995
- "step": 134
996
- },
997
- {
998
- "epoch": 0.033714178685147034,
999
- "grad_norm": 0.8696761727333069,
1000
- "learning_rate": 0.00019933597314736228,
1001
- "loss": 2.5734,
1002
- "step": 135
1003
- },
1004
- {
1005
- "epoch": 0.03396391334207405,
1006
- "grad_norm": 0.828972578048706,
1007
- "learning_rate": 0.0001993154105265468,
1008
- "loss": 2.6162,
1009
- "step": 136
1010
- },
1011
- {
1012
- "epoch": 0.03421364799900106,
1013
- "grad_norm": 0.9171616435050964,
1014
- "learning_rate": 0.00019929453558022392,
1015
- "loss": 2.4986,
1016
- "step": 137
1017
- },
1018
- {
1019
- "epoch": 0.034463382655928076,
1020
- "grad_norm": 0.8592952489852905,
1021
- "learning_rate": 0.00019927334838139098,
1022
- "loss": 2.4374,
1023
- "step": 138
1024
- },
1025
- {
1026
- "epoch": 0.03471311731285509,
1027
- "grad_norm": 0.969752311706543,
1028
- "learning_rate": 0.00019925184900413705,
1029
- "loss": 2.3063,
1030
- "step": 139
1031
- },
1032
- {
1033
- "epoch": 0.034962851969782105,
1034
- "grad_norm": 0.863645076751709,
1035
- "learning_rate": 0.00019923003752364297,
1036
- "loss": 2.3734,
1037
- "step": 140
1038
- },
1039
- {
1040
- "epoch": 0.03521258662670912,
1041
- "grad_norm": 1.038589596748352,
1042
- "learning_rate": 0.00019920791401618088,
1043
- "loss": 2.6056,
1044
- "step": 141
1045
- },
1046
- {
1047
- "epoch": 0.03546232128363613,
1048
- "grad_norm": 1.0344328880310059,
1049
- "learning_rate": 0.00019918547855911413,
1050
- "loss": 2.5048,
1051
- "step": 142
1052
- },
1053
- {
1054
- "epoch": 0.035712055940563155,
1055
- "grad_norm": 0.883296012878418,
1056
- "learning_rate": 0.0001991627312308969,
1057
- "loss": 2.4579,
1058
- "step": 143
1059
- },
1060
- {
1061
- "epoch": 0.03596179059749017,
1062
- "grad_norm": 0.9329782724380493,
1063
- "learning_rate": 0.00019913967211107386,
1064
- "loss": 2.4917,
1065
- "step": 144
1066
- },
1067
- {
1068
- "epoch": 0.03621152525441718,
1069
- "grad_norm": 0.9287551641464233,
1070
- "learning_rate": 0.00019911630128028012,
1071
- "loss": 2.648,
1072
- "step": 145
1073
- },
1074
- {
1075
- "epoch": 0.0364612599113442,
1076
- "grad_norm": 0.9368601441383362,
1077
- "learning_rate": 0.00019909261882024065,
1078
- "loss": 2.4704,
1079
- "step": 146
1080
- },
1081
- {
1082
- "epoch": 0.03671099456827121,
1083
- "grad_norm": 0.9918820261955261,
1084
- "learning_rate": 0.00019906862481377033,
1085
- "loss": 2.3632,
1086
- "step": 147
1087
- },
1088
- {
1089
- "epoch": 0.036960729225198226,
1090
- "grad_norm": 1.0322192907333374,
1091
- "learning_rate": 0.0001990443193447733,
1092
- "loss": 2.5039,
1093
- "step": 148
1094
- },
1095
- {
1096
- "epoch": 0.03721046388212524,
1097
- "grad_norm": 1.307169795036316,
1098
- "learning_rate": 0.000199019702498243,
1099
- "loss": 2.483,
1100
- "step": 149
1101
- },
1102
- {
1103
- "epoch": 0.037460198539052254,
1104
- "grad_norm": 2.3513553142547607,
1105
- "learning_rate": 0.00019899477436026157,
1106
- "loss": 2.3276,
1107
- "step": 150
1108
- },
1109
- {
1110
- "epoch": 0.037460198539052254,
1111
- "eval_loss": 2.570774555206299,
1112
- "eval_runtime": 2.6997,
1113
- "eval_samples_per_second": 18.521,
1114
- "eval_steps_per_second": 2.963,
1115
- "step": 150
1116
  }
1117
  ],
1118
  "logging_steps": 1,
1119
- "max_steps": 1750,
1120
  "num_input_tokens_seen": 0,
1121
  "num_train_epochs": 1,
1122
  "save_steps": 50,
@@ -1141,8 +409,8 @@
1141
  "attributes": {}
1142
  }
1143
  },
1144
- "total_flos": 5.089011062695526e+16,
1145
- "train_batch_size": 7,
1146
  "trial_name": null,
1147
  "trial_params": null
1148
  }
 
1
  {
2
+ "best_metric": 2.556696653366089,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-50",
4
+ "epoch": 0.014271060964188556,
5
  "eval_steps": 25,
6
+ "global_step": 50,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0002854212192837711,
13
+ "grad_norm": 0.9852302670478821,
14
+ "learning_rate": 5.7142857142857145e-06,
15
+ "loss": 3.0904,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.0002854212192837711,
20
+ "eval_loss": 3.2887184619903564,
21
+ "eval_runtime": 6.5686,
22
+ "eval_samples_per_second": 7.612,
23
+ "eval_steps_per_second": 7.612,
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.0005708424385675422,
28
+ "grad_norm": 0.8380288481712341,
29
+ "learning_rate": 1.1428571428571429e-05,
30
+ "loss": 3.1387,
31
  "step": 2
32
  },
33
  {
34
+ "epoch": 0.0008562636578513134,
35
+ "grad_norm": 0.8498082160949707,
36
+ "learning_rate": 1.7142857142857145e-05,
37
+ "loss": 3.1155,
38
  "step": 3
39
  },
40
  {
41
+ "epoch": 0.0011416848771350844,
42
+ "grad_norm": 0.932189404964447,
43
+ "learning_rate": 2.2857142857142858e-05,
44
+ "loss": 3.1839,
45
  "step": 4
46
  },
47
  {
48
+ "epoch": 0.0014271060964188556,
49
+ "grad_norm": 0.8401004076004028,
50
+ "learning_rate": 2.857142857142857e-05,
51
+ "loss": 3.149,
52
  "step": 5
53
  },
54
  {
55
+ "epoch": 0.0017125273157026267,
56
+ "grad_norm": 0.8718746900558472,
57
+ "learning_rate": 3.428571428571429e-05,
58
+ "loss": 3.1855,
59
  "step": 6
60
  },
61
  {
62
+ "epoch": 0.001997948534986398,
63
+ "grad_norm": 0.8892223238945007,
64
+ "learning_rate": 4e-05,
65
+ "loss": 3.0531,
66
  "step": 7
67
  },
68
  {
69
+ "epoch": 0.002283369754270169,
70
+ "grad_norm": 0.8379232287406921,
71
+ "learning_rate": 4.5714285714285716e-05,
72
+ "loss": 3.042,
73
  "step": 8
74
  },
75
  {
76
+ "epoch": 0.00256879097355394,
77
+ "grad_norm": 0.8372194170951843,
78
+ "learning_rate": 5.142857142857143e-05,
79
+ "loss": 2.9784,
80
  "step": 9
81
  },
82
  {
83
+ "epoch": 0.002854212192837711,
84
+ "grad_norm": 0.7960942983627319,
85
+ "learning_rate": 5.714285714285714e-05,
86
+ "loss": 3.1879,
87
  "step": 10
88
  },
89
  {
90
+ "epoch": 0.0031396334121214825,
91
+ "grad_norm": 0.8907091021537781,
92
+ "learning_rate": 6.285714285714286e-05,
93
+ "loss": 3.0808,
94
  "step": 11
95
  },
96
  {
97
+ "epoch": 0.0034250546314052535,
98
+ "grad_norm": 0.6043815016746521,
99
+ "learning_rate": 6.857142857142858e-05,
100
+ "loss": 2.9428,
101
  "step": 12
102
  },
103
  {
104
+ "epoch": 0.003710475850689025,
105
+ "grad_norm": 0.564944863319397,
106
+ "learning_rate": 7.428571428571429e-05,
107
+ "loss": 3.0157,
108
  "step": 13
109
  },
110
  {
111
+ "epoch": 0.003995897069972796,
112
+ "grad_norm": 0.5410389304161072,
113
+ "learning_rate": 8e-05,
114
+ "loss": 2.9157,
115
  "step": 14
116
  },
117
  {
118
+ "epoch": 0.004281318289256567,
119
+ "grad_norm": 0.5174809098243713,
120
+ "learning_rate": 8.571428571428571e-05,
121
+ "loss": 2.7908,
122
  "step": 15
123
  },
124
  {
125
+ "epoch": 0.004566739508540338,
126
+ "grad_norm": 0.5930260419845581,
127
+ "learning_rate": 9.142857142857143e-05,
128
+ "loss": 2.8472,
129
  "step": 16
130
  },
131
  {
132
+ "epoch": 0.0048521607278241095,
133
+ "grad_norm": 0.6121829152107239,
134
+ "learning_rate": 9.714285714285715e-05,
135
+ "loss": 2.8535,
136
  "step": 17
137
  },
138
  {
139
+ "epoch": 0.00513758194710788,
140
+ "grad_norm": 0.5915908813476562,
141
+ "learning_rate": 0.00010285714285714286,
142
+ "loss": 2.6747,
143
  "step": 18
144
  },
145
  {
146
+ "epoch": 0.005423003166391651,
147
+ "grad_norm": 0.5632700324058533,
148
+ "learning_rate": 0.00010857142857142856,
149
+ "loss": 2.6425,
150
  "step": 19
151
  },
152
  {
153
+ "epoch": 0.005708424385675422,
154
+ "grad_norm": 0.5865013599395752,
155
+ "learning_rate": 0.00011428571428571428,
156
+ "loss": 2.8892,
157
  "step": 20
158
  },
159
  {
160
+ "epoch": 0.005993845604959194,
161
+ "grad_norm": 0.6056029796600342,
162
+ "learning_rate": 0.00012,
163
+ "loss": 2.9813,
164
  "step": 21
165
  },
166
  {
167
+ "epoch": 0.006279266824242965,
168
+ "grad_norm": 0.5435022711753845,
169
+ "learning_rate": 0.00012571428571428572,
170
+ "loss": 2.6502,
171
  "step": 22
172
  },
173
  {
174
+ "epoch": 0.006564688043526736,
175
+ "grad_norm": 0.5630282163619995,
176
+ "learning_rate": 0.00013142857142857143,
177
+ "loss": 2.6238,
178
  "step": 23
179
  },
180
  {
181
+ "epoch": 0.006850109262810507,
182
+ "grad_norm": 0.6111990213394165,
183
+ "learning_rate": 0.00013714285714285716,
184
+ "loss": 2.7437,
185
  "step": 24
186
  },
187
  {
188
+ "epoch": 0.007135530482094278,
189
+ "grad_norm": 0.5657336115837097,
190
+ "learning_rate": 0.00014285714285714287,
191
+ "loss": 2.6898,
192
  "step": 25
193
  },
194
  {
195
+ "epoch": 0.007135530482094278,
196
+ "eval_loss": 2.7006003856658936,
197
+ "eval_runtime": 6.6882,
198
+ "eval_samples_per_second": 7.476,
199
+ "eval_steps_per_second": 7.476,
200
  "step": 25
201
  },
202
  {
203
+ "epoch": 0.00742095170137805,
204
+ "grad_norm": 0.5724318623542786,
205
+ "learning_rate": 0.00014857142857142857,
206
+ "loss": 2.6936,
207
  "step": 26
208
  },
209
  {
210
+ "epoch": 0.007706372920661821,
211
+ "grad_norm": 0.47816792130470276,
212
+ "learning_rate": 0.0001542857142857143,
213
+ "loss": 2.6312,
214
  "step": 27
215
  },
216
  {
217
+ "epoch": 0.007991794139945592,
218
+ "grad_norm": 0.5189191102981567,
219
+ "learning_rate": 0.00016,
220
+ "loss": 2.8591,
221
  "step": 28
222
  },
223
  {
224
+ "epoch": 0.008277215359229362,
225
+ "grad_norm": 0.5170201063156128,
226
+ "learning_rate": 0.00016571428571428575,
227
+ "loss": 2.5753,
228
  "step": 29
229
  },
230
  {
231
+ "epoch": 0.008562636578513133,
232
+ "grad_norm": 0.4780126214027405,
233
+ "learning_rate": 0.00017142857142857143,
234
+ "loss": 2.5154,
235
  "step": 30
236
  },
237
  {
238
+ "epoch": 0.008848057797796904,
239
+ "grad_norm": 0.46953848004341125,
240
+ "learning_rate": 0.00017714285714285713,
241
+ "loss": 2.5638,
242
  "step": 31
243
  },
244
  {
245
+ "epoch": 0.009133479017080675,
246
+ "grad_norm": 0.5048589706420898,
247
+ "learning_rate": 0.00018285714285714286,
248
+ "loss": 2.6244,
249
  "step": 32
250
  },
251
  {
252
+ "epoch": 0.009418900236364448,
253
+ "grad_norm": 0.5477012395858765,
254
+ "learning_rate": 0.00018857142857142857,
255
+ "loss": 2.7039,
256
  "step": 33
257
  },
258
  {
259
+ "epoch": 0.009704321455648219,
260
+ "grad_norm": 0.510737955570221,
261
+ "learning_rate": 0.0001942857142857143,
262
+ "loss": 2.6319,
263
  "step": 34
264
  },
265
  {
266
+ "epoch": 0.00998974267493199,
267
+ "grad_norm": 0.5701810121536255,
268
+ "learning_rate": 0.0002,
269
+ "loss": 2.6441,
270
  "step": 35
271
  },
272
  {
273
+ "epoch": 0.01027516389421576,
274
+ "grad_norm": 0.49409958720207214,
275
+ "learning_rate": 0.00019999938960115114,
276
+ "loss": 2.5378,
277
  "step": 36
278
  },
279
  {
280
+ "epoch": 0.010560585113499532,
281
+ "grad_norm": 0.5363864302635193,
282
+ "learning_rate": 0.0001999975584128843,
283
+ "loss": 2.4543,
284
  "step": 37
285
  },
286
  {
287
+ "epoch": 0.010846006332783303,
288
+ "grad_norm": 0.511552095413208,
289
+ "learning_rate": 0.00019999450646003843,
290
+ "loss": 2.6913,
291
  "step": 38
292
  },
293
  {
294
+ "epoch": 0.011131427552067074,
295
+ "grad_norm": 0.568918764591217,
296
+ "learning_rate": 0.0001999902337840116,
297
+ "loss": 2.5901,
298
  "step": 39
299
  },
300
  {
301
+ "epoch": 0.011416848771350845,
302
+ "grad_norm": 0.5946168899536133,
303
+ "learning_rate": 0.00019998474044276,
304
+ "loss": 2.6054,
305
  "step": 40
306
  },
307
  {
308
+ "epoch": 0.011702269990634616,
309
+ "grad_norm": 0.6025540232658386,
310
+ "learning_rate": 0.00019997802651079778,
311
+ "loss": 2.7296,
312
  "step": 41
313
  },
314
  {
315
+ "epoch": 0.011987691209918388,
316
+ "grad_norm": 0.589484453201294,
317
+ "learning_rate": 0.00019997009207919545,
318
+ "loss": 2.6412,
319
  "step": 42
320
  },
321
  {
322
+ "epoch": 0.01227311242920216,
323
+ "grad_norm": 0.6304107308387756,
324
+ "learning_rate": 0.00019996093725557898,
325
+ "loss": 2.6191,
326
  "step": 43
327
  },
328
  {
329
+ "epoch": 0.01255853364848593,
330
+ "grad_norm": 0.6326958537101746,
331
+ "learning_rate": 0.00019995056216412824,
332
+ "loss": 2.5749,
333
  "step": 44
334
  },
335
  {
336
+ "epoch": 0.012843954867769701,
337
+ "grad_norm": 0.6734350919723511,
338
+ "learning_rate": 0.0001999389669455753,
339
+ "loss": 2.7463,
340
  "step": 45
341
  },
342
  {
343
+ "epoch": 0.013129376087053472,
344
+ "grad_norm": 0.6757857203483582,
345
+ "learning_rate": 0.00019992615175720257,
346
+ "loss": 2.4429,
347
  "step": 46
348
  },
349
  {
350
+ "epoch": 0.013414797306337243,
351
+ "grad_norm": 0.687453031539917,
352
+ "learning_rate": 0.00019991211677284062,
353
+ "loss": 2.595,
354
  "step": 47
355
  },
356
  {
357
+ "epoch": 0.013700218525621014,
358
+ "grad_norm": 0.7763611078262329,
359
+ "learning_rate": 0.00019989686218286587,
360
+ "loss": 2.7578,
361
  "step": 48
362
  },
363
  {
364
+ "epoch": 0.013985639744904785,
365
+ "grad_norm": 0.9888719916343689,
366
+ "learning_rate": 0.00019988038819419797,
367
+ "loss": 2.5648,
368
  "step": 49
369
  },
370
  {
371
+ "epoch": 0.014271060964188556,
372
+ "grad_norm": 1.6425894498825073,
373
+ "learning_rate": 0.00019986269503029697,
374
+ "loss": 2.5132,
375
  "step": 50
376
  },
377
  {
378
+ "epoch": 0.014271060964188556,
379
+ "eval_loss": 2.556696653366089,
380
+ "eval_runtime": 6.6833,
381
+ "eval_samples_per_second": 7.481,
382
+ "eval_steps_per_second": 7.481,
383
  "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  }
385
  ],
386
  "logging_steps": 1,
387
+ "max_steps": 888,
388
  "num_input_tokens_seen": 0,
389
  "num_train_epochs": 1,
390
  "save_steps": 50,
 
409
  "attributes": {}
410
  }
411
  },
412
+ "total_flos": 5.593179412129382e+16,
413
+ "train_batch_size": 1,
414
  "trial_name": null,
415
  "trial_params": null
416
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:149f14fb729cab896091dbb95460bb98759373752ba272a6ae1f5cde47ccec9e
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c10884f4be4e91add9795afd80f7b25e35f322adac12d6b37b6d44aa702d50c
3
  size 6776