rs545837 committed on
Commit bd5e9d3 · verified · 1 Parent(s): ac39a6f

Upload folder using huggingface_hub
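The commit message indicates the checkpoint folder was pushed with huggingface_hub. A minimal sketch of the kind of call that produces such a commit is given below; the local folder path and repo id are placeholders, not values taken from this commit.

```python
# Hedged sketch: pushing a local checkpoint folder with huggingface_hub.
# folder_path and repo_id are placeholders; only the commit message matches the commit above.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="./checkpoint-13908",   # local checkpoint directory (assumed)
    repo_id="rs545837/<model-repo>",    # destination repo (placeholder)
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```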
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c9b2c4db3a2d556ee001eb8fdf128d644b789b40b2f7cf64684b6fe78989053b
+ oid sha256:b0f1a637ff5efd3a742cf0adbf6c58e6934d9a4a6215f862074742126fcf08c2
  size 213625344
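Each of the changed files is stored as a Git LFS pointer, so the diff only shows the pointer's `oid sha256:` and `size` fields changing. A minimal sketch for checking that a locally downloaded blob matches the new pointer (oid and size copied from the hunk above; the local path is assumed):

```python
# Sketch: verify a downloaded model.safetensors against the LFS pointer above.
import hashlib
from pathlib import Path

def lfs_oid_and_size(path):
    """Return (sha256 hex digest, byte size) of a local file, streamed in 1 MiB chunks."""
    h = hashlib.sha256()
    p = Path(path)
    with p.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest(), p.stat().st_size

oid, size = lfs_oid_and_size("model.safetensors")  # assumed local copy of the resolved blob
assert size == 213625344
assert oid == "b0f1a637ff5efd3a742cf0adbf6c58e6934d9a4a6215f862074742126fcf08c2"
```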
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:129003e0eae631ea59961c42baa93d6ece566b523e5b57d228356535ea34946d
+ oid sha256:bf290f7d31c224d60ac5c8ed732a0ecc556301b4c61d776c8545f9109d532312
  size 427334458
rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:593cfa780b4b09ba583a139eb81c2eae72992c19fc5f8f38c81bd37ea47dbe04
+ oid sha256:435089f27ea78cb1b5ffb371da67f47c7a3dea92ab07479122e62b4e8dbeed97
  size 16433
rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9560f8c460a856a55828494146d2d52ecf0d95a3dec5919d8f29a972450cec34
+ oid sha256:35c3205fe632396691980ab13bb747592db3b39a8f9bc42c6b4bce2ebc4e86d2
  size 16433
rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3e7b7bce88125710e1c78de933cf62b48d6ec5a97b36fc43a09d7f70aebd0307
+ oid sha256:bdd18dda129c8617269378af8c4207e690d6ed4efdbda0ce1aa5947221052d4a
  size 16433
rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4cb51f987fa17a879db4d368b0564564ae49379d5c5ce803d79d24b4b5a43c13
+ oid sha256:7d14c531768fd1b817e6ad83f2878c07af9aa939a5372ffc3020b84164720063
  size 16433
rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:33ed1a64882ec192e7bdbd0b9dda7c3dd977bc8ef889d26ddca3e2380d9debae
+ oid sha256:6343b5ecd40e08336f425dc8913f4b43aad1a7465797446cfa1892e3cb3133f3
  size 16433
rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4dd0f015bb5215ff40f1555f5c47c89a7bd89b00e7ef4568ca045dc1c2b5514a
+ oid sha256:0492bd561ad6444b11560f52c5a570d34e2dcc461489aae4bd2703fa55d13f47
  size 16433
rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ac5bc7ba9f4a9e405864d41e902bc7509a5b6fa554a6cf09f24491e00dac06fb
+ oid sha256:7a181670929d07bdff88356f8cc3404c6ef87b1e788fc5b1dc6f473ac9b2bc12
  size 16433
rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:77e384968f192bdc497600d0108b82695a12247413143c0c9bd4e09fbb718212
+ oid sha256:23b0154dd6c58d1151b932c0d3de52258209b2d646a7e6e34de818d4dfa12a13
  size 16433
rng_state_8.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cc46c9518ec829f507eb5833115c977024d13a12bc4e0ecff2238d818e6eb6dc
+ oid sha256:a0c53426dcd44d43fe238422e433fb361c04df5585664ed42f08da1365fc112f
  size 16433
rng_state_9.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:56cb5d3bf2f9602568c76013ebf1c626061418df7f27c779c448921d362d5232
+ oid sha256:268b7b1ca55cb9d2d96542e3789b74164ec07561d7774458ecef281e2a3ad163
  size 16433
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:83a9c85c7b2c29125f99f000e54e900b05be0859260af7e4a0abf634beb2c469
+ oid sha256:7fc0c0b23c1fbf47eda082a36b0831feacf86e3aa80c09efc720587a7e961c90
  size 1064
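The ten rng_state_0.pth … rng_state_9.pth files suggest this checkpoint was written by a run with ten data-parallel processes, with one RNG snapshot per rank (as the Hugging Face Trainer typically does for multi-process runs). A small, hedged sanity check over a local copy of the checkpoint folder (the directory name is a placeholder):

```python
# Sketch: confirm there is one per-rank RNG file for each of the 10 ranks seen in this commit.
from pathlib import Path

ckpt = Path("./checkpoint-13908")  # placeholder local checkpoint directory
ranks = sorted(int(p.stem.rsplit("_", 1)[1]) for p in ckpt.glob("rng_state_*.pth"))
assert ranks == list(range(10)), ranks
```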
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.3998792426790874,
+ "epoch": 0.5998188640186312,
  "eval_steps": 2318,
- "global_step": 9272,
+ "global_step": 13908,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -2856,6 +2856,1427 @@
  "eval_samples_per_second": 610.045,
  "eval_steps_per_second": 7.626,
  "step": 9272
2859
+ },
2860
+ {
2861
+ "epoch": 0.4007417949713201,
2862
+ "grad_norm": 0.408203125,
2863
+ "learning_rate": 0.0005998532205145915,
2864
+ "loss": 1.0191,
2865
+ "step": 9292
2866
+ },
2867
+ {
2868
+ "epoch": 0.4017337301073878,
2869
+ "grad_norm": 0.46484375,
2870
+ "learning_rate": 0.0005988603004662408,
2871
+ "loss": 1.0074,
2872
+ "step": 9315
2873
+ },
2874
+ {
2875
+ "epoch": 0.4027256652434554,
2876
+ "grad_norm": 0.353515625,
2877
+ "learning_rate": 0.0005978673804178898,
2878
+ "loss": 1.0052,
2879
+ "step": 9338
2880
+ },
2881
+ {
2882
+ "epoch": 0.40371760037952303,
2883
+ "grad_norm": 0.361328125,
2884
+ "learning_rate": 0.000596874460369539,
2885
+ "loss": 1.0248,
2886
+ "step": 9361
2887
+ },
2888
+ {
2889
+ "epoch": 0.40470953551559063,
2890
+ "grad_norm": 0.376953125,
2891
+ "learning_rate": 0.000595881540321188,
2892
+ "loss": 1.0328,
2893
+ "step": 9384
2894
+ },
2895
+ {
2896
+ "epoch": 0.40570147065165824,
2897
+ "grad_norm": 0.4453125,
2898
+ "learning_rate": 0.0005948886202728372,
2899
+ "loss": 1.0254,
2900
+ "step": 9407
2901
+ },
2902
+ {
2903
+ "epoch": 0.4066934057877259,
2904
+ "grad_norm": 0.302734375,
2905
+ "learning_rate": 0.0005938957002244862,
2906
+ "loss": 1.0285,
2907
+ "step": 9430
2908
+ },
2909
+ {
2910
+ "epoch": 0.4076853409237935,
2911
+ "grad_norm": 0.439453125,
2912
+ "learning_rate": 0.0005929027801761354,
2913
+ "loss": 1.0077,
2914
+ "step": 9453
2915
+ },
2916
+ {
2917
+ "epoch": 0.40867727605986115,
2918
+ "grad_norm": 0.3515625,
2919
+ "learning_rate": 0.0005919098601277845,
2920
+ "loss": 1.0181,
2921
+ "step": 9476
2922
+ },
2923
+ {
2924
+ "epoch": 0.40966921119592875,
2925
+ "grad_norm": 0.33203125,
2926
+ "learning_rate": 0.0005909169400794336,
2927
+ "loss": 1.0427,
2928
+ "step": 9499
2929
+ },
2930
+ {
2931
+ "epoch": 0.41066114633199635,
2932
+ "grad_norm": 0.380859375,
2933
+ "learning_rate": 0.0005899240200310827,
2934
+ "loss": 1.0133,
2935
+ "step": 9522
2936
+ },
2937
+ {
2938
+ "epoch": 0.411653081468064,
2939
+ "grad_norm": 0.40234375,
2940
+ "learning_rate": 0.0005889310999827318,
2941
+ "loss": 1.0187,
2942
+ "step": 9545
2943
+ },
2944
+ {
2945
+ "epoch": 0.4126450166041316,
2946
+ "grad_norm": 0.44921875,
2947
+ "learning_rate": 0.000587938179934381,
2948
+ "loss": 1.0091,
2949
+ "step": 9568
2950
+ },
2951
+ {
2952
+ "epoch": 0.41363695174019927,
2953
+ "grad_norm": 0.37109375,
2954
+ "learning_rate": 0.0005869452598860301,
2955
+ "loss": 1.002,
2956
+ "step": 9591
2957
+ },
2958
+ {
2959
+ "epoch": 0.41462888687626687,
2960
+ "grad_norm": 0.47265625,
2961
+ "learning_rate": 0.0005859523398376792,
2962
+ "loss": 1.0321,
2963
+ "step": 9614
2964
+ },
2965
+ {
2966
+ "epoch": 0.4156208220123345,
2967
+ "grad_norm": 0.4453125,
2968
+ "learning_rate": 0.0005849594197893283,
2969
+ "loss": 1.0244,
2970
+ "step": 9637
2971
+ },
2972
+ {
2973
+ "epoch": 0.4166127571484021,
2974
+ "grad_norm": 0.4375,
2975
+ "learning_rate": 0.0005839664997409773,
2976
+ "loss": 1.0345,
2977
+ "step": 9660
2978
+ },
2979
+ {
2980
+ "epoch": 0.4176046922844697,
2981
+ "grad_norm": 0.357421875,
2982
+ "learning_rate": 0.0005829735796926266,
2983
+ "loss": 1.0157,
2984
+ "step": 9683
2985
+ },
2986
+ {
2987
+ "epoch": 0.4185966274205374,
2988
+ "grad_norm": 0.3515625,
2989
+ "learning_rate": 0.0005819806596442756,
2990
+ "loss": 0.9774,
2991
+ "step": 9706
2992
+ },
2993
+ {
2994
+ "epoch": 0.419588562556605,
2995
+ "grad_norm": 0.408203125,
2996
+ "learning_rate": 0.0005809877395959248,
2997
+ "loss": 1.0011,
2998
+ "step": 9729
2999
+ },
3000
+ {
3001
+ "epoch": 0.42058049769267264,
3002
+ "grad_norm": 0.41015625,
3003
+ "learning_rate": 0.0005799948195475738,
3004
+ "loss": 1.0148,
3005
+ "step": 9752
3006
+ },
3007
+ {
3008
+ "epoch": 0.42157243282874024,
3009
+ "grad_norm": 0.53515625,
3010
+ "learning_rate": 0.000579001899499223,
3011
+ "loss": 1.0058,
3012
+ "step": 9775
3013
+ },
3014
+ {
3015
+ "epoch": 0.42256436796480784,
3016
+ "grad_norm": 0.37890625,
3017
+ "learning_rate": 0.000578008979450872,
3018
+ "loss": 1.0241,
3019
+ "step": 9798
3020
+ },
3021
+ {
3022
+ "epoch": 0.4235563031008755,
3023
+ "grad_norm": 0.38671875,
3024
+ "learning_rate": 0.0005770160594025212,
3025
+ "loss": 1.0067,
3026
+ "step": 9821
3027
+ },
3028
+ {
3029
+ "epoch": 0.4245482382369431,
3030
+ "grad_norm": 0.3359375,
3031
+ "learning_rate": 0.0005760231393541703,
3032
+ "loss": 1.0105,
3033
+ "step": 9844
3034
+ },
3035
+ {
3036
+ "epoch": 0.42554017337301075,
3037
+ "grad_norm": 0.40625,
3038
+ "learning_rate": 0.0005750302193058193,
3039
+ "loss": 1.0226,
3040
+ "step": 9867
3041
+ },
3042
+ {
3043
+ "epoch": 0.42653210850907836,
3044
+ "grad_norm": 0.392578125,
3045
+ "learning_rate": 0.0005740372992574685,
3046
+ "loss": 1.0243,
3047
+ "step": 9890
3048
+ },
3049
+ {
3050
+ "epoch": 0.427524043645146,
3051
+ "grad_norm": 0.419921875,
3052
+ "learning_rate": 0.0005730443792091175,
3053
+ "loss": 1.0128,
3054
+ "step": 9913
3055
+ },
3056
+ {
3057
+ "epoch": 0.4285159787812136,
3058
+ "grad_norm": 0.373046875,
3059
+ "learning_rate": 0.0005720514591607668,
3060
+ "loss": 0.9993,
3061
+ "step": 9936
3062
+ },
3063
+ {
3064
+ "epoch": 0.4295079139172812,
3065
+ "grad_norm": 0.365234375,
3066
+ "learning_rate": 0.0005710585391124158,
3067
+ "loss": 1.012,
3068
+ "step": 9959
3069
+ },
3070
+ {
3071
+ "epoch": 0.43049984905334887,
3072
+ "grad_norm": 0.302734375,
3073
+ "learning_rate": 0.000570065619064065,
3074
+ "loss": 1.0178,
3075
+ "step": 9982
3076
+ },
3077
+ {
3078
+ "epoch": 0.43149178418941647,
3079
+ "grad_norm": 0.3671875,
3080
+ "learning_rate": 0.000569072699015714,
3081
+ "loss": 1.0192,
3082
+ "step": 10005
3083
+ },
3084
+ {
3085
+ "epoch": 0.43248371932548413,
3086
+ "grad_norm": 0.345703125,
3087
+ "learning_rate": 0.0005680797789673631,
3088
+ "loss": 0.9992,
3089
+ "step": 10028
3090
+ },
3091
+ {
3092
+ "epoch": 0.43347565446155173,
3093
+ "grad_norm": 0.4140625,
3094
+ "learning_rate": 0.0005670868589190123,
3095
+ "loss": 1.0033,
3096
+ "step": 10051
3097
+ },
3098
+ {
3099
+ "epoch": 0.43446758959761933,
3100
+ "grad_norm": 0.359375,
3101
+ "learning_rate": 0.0005660939388706614,
3102
+ "loss": 1.0153,
3103
+ "step": 10074
3104
+ },
3105
+ {
3106
+ "epoch": 0.435459524733687,
3107
+ "grad_norm": 0.36328125,
3108
+ "learning_rate": 0.0005651010188223105,
3109
+ "loss": 0.997,
3110
+ "step": 10097
3111
+ },
3112
+ {
3113
+ "epoch": 0.4364514598697546,
3114
+ "grad_norm": 0.3671875,
3115
+ "learning_rate": 0.0005641080987739596,
3116
+ "loss": 1.0147,
3117
+ "step": 10120
3118
+ },
3119
+ {
3120
+ "epoch": 0.43744339500582224,
3121
+ "grad_norm": 0.3515625,
3122
+ "learning_rate": 0.0005631151787256087,
3123
+ "loss": 1.0046,
3124
+ "step": 10143
3125
+ },
3126
+ {
3127
+ "epoch": 0.43843533014188985,
3128
+ "grad_norm": 0.37890625,
3129
+ "learning_rate": 0.0005621222586772579,
3130
+ "loss": 1.0008,
3131
+ "step": 10166
3132
+ },
3133
+ {
3134
+ "epoch": 0.4394272652779575,
3135
+ "grad_norm": 0.41796875,
3136
+ "learning_rate": 0.0005611293386289069,
3137
+ "loss": 1.0315,
3138
+ "step": 10189
3139
+ },
3140
+ {
3141
+ "epoch": 0.4404192004140251,
3142
+ "grad_norm": 0.365234375,
3143
+ "learning_rate": 0.0005601364185805561,
3144
+ "loss": 1.0164,
3145
+ "step": 10212
3146
+ },
3147
+ {
3148
+ "epoch": 0.4414111355500927,
3149
+ "grad_norm": 0.361328125,
3150
+ "learning_rate": 0.0005591434985322051,
3151
+ "loss": 1.0044,
3152
+ "step": 10235
3153
+ },
3154
+ {
3155
+ "epoch": 0.44240307068616036,
3156
+ "grad_norm": 0.439453125,
3157
+ "learning_rate": 0.0005581505784838543,
3158
+ "loss": 1.0236,
3159
+ "step": 10258
3160
+ },
3161
+ {
3162
+ "epoch": 0.44339500582222796,
3163
+ "grad_norm": 0.341796875,
3164
+ "learning_rate": 0.0005571576584355034,
3165
+ "loss": 1.0156,
3166
+ "step": 10281
3167
+ },
3168
+ {
3169
+ "epoch": 0.4443869409582956,
3170
+ "grad_norm": 0.373046875,
3171
+ "learning_rate": 0.0005561647383871526,
3172
+ "loss": 0.9916,
3173
+ "step": 10304
3174
+ },
3175
+ {
3176
+ "epoch": 0.4453788760943632,
3177
+ "grad_norm": 0.33203125,
3178
+ "learning_rate": 0.0005551718183388016,
3179
+ "loss": 0.9961,
3180
+ "step": 10327
3181
+ },
3182
+ {
3183
+ "epoch": 0.4463708112304308,
3184
+ "grad_norm": 0.392578125,
3185
+ "learning_rate": 0.0005541788982904507,
3186
+ "loss": 1.0021,
3187
+ "step": 10350
3188
+ },
3189
+ {
3190
+ "epoch": 0.4473627463664985,
3191
+ "grad_norm": 0.375,
3192
+ "learning_rate": 0.0005531859782420998,
3193
+ "loss": 1.0219,
3194
+ "step": 10373
3195
+ },
3196
+ {
3197
+ "epoch": 0.4483546815025661,
3198
+ "grad_norm": 0.4140625,
3199
+ "learning_rate": 0.000552193058193749,
3200
+ "loss": 0.9982,
3201
+ "step": 10396
3202
+ },
3203
+ {
3204
+ "epoch": 0.44934661663863373,
3205
+ "grad_norm": 0.392578125,
3206
+ "learning_rate": 0.0005512001381453981,
3207
+ "loss": 0.994,
3208
+ "step": 10419
3209
+ },
3210
+ {
3211
+ "epoch": 0.45033855177470133,
3212
+ "grad_norm": 0.34765625,
3213
+ "learning_rate": 0.0005502072180970471,
3214
+ "loss": 0.9899,
3215
+ "step": 10442
3216
+ },
3217
+ {
3218
+ "epoch": 0.451330486910769,
3219
+ "grad_norm": 0.3828125,
3220
+ "learning_rate": 0.0005492142980486963,
3221
+ "loss": 1.0096,
3222
+ "step": 10465
3223
+ },
3224
+ {
3225
+ "epoch": 0.4523224220468366,
3226
+ "grad_norm": 0.3984375,
3227
+ "learning_rate": 0.0005482213780003453,
3228
+ "loss": 0.9882,
3229
+ "step": 10488
3230
+ },
3231
+ {
3232
+ "epoch": 0.4533143571829042,
3233
+ "grad_norm": 0.390625,
3234
+ "learning_rate": 0.0005472284579519945,
3235
+ "loss": 0.999,
3236
+ "step": 10511
3237
+ },
3238
+ {
3239
+ "epoch": 0.45430629231897185,
3240
+ "grad_norm": 0.3984375,
3241
+ "learning_rate": 0.0005462355379036436,
3242
+ "loss": 1.0087,
3243
+ "step": 10534
3244
+ },
3245
+ {
3246
+ "epoch": 0.45529822745503945,
3247
+ "grad_norm": 0.388671875,
3248
+ "learning_rate": 0.0005452426178552927,
3249
+ "loss": 0.9985,
3250
+ "step": 10557
3251
+ },
3252
+ {
3253
+ "epoch": 0.4562901625911071,
3254
+ "grad_norm": 0.455078125,
3255
+ "learning_rate": 0.0005442496978069418,
3256
+ "loss": 1.0104,
3257
+ "step": 10580
3258
+ },
3259
+ {
3260
+ "epoch": 0.4572820977271747,
3261
+ "grad_norm": 0.61328125,
3262
+ "learning_rate": 0.0005432567777585909,
3263
+ "loss": 1.0056,
3264
+ "step": 10603
3265
+ },
3266
+ {
3267
+ "epoch": 0.4582740328632423,
3268
+ "grad_norm": 0.3359375,
3269
+ "learning_rate": 0.00054226385771024,
3270
+ "loss": 1.0115,
3271
+ "step": 10626
3272
+ },
3273
+ {
3274
+ "epoch": 0.45926596799930997,
3275
+ "grad_norm": 0.3515625,
3276
+ "learning_rate": 0.0005412709376618892,
3277
+ "loss": 1.0143,
3278
+ "step": 10649
3279
+ },
3280
+ {
3281
+ "epoch": 0.46025790313537757,
3282
+ "grad_norm": 0.388671875,
3283
+ "learning_rate": 0.0005402780176135383,
3284
+ "loss": 0.9916,
3285
+ "step": 10672
3286
+ },
3287
+ {
3288
+ "epoch": 0.4612498382714452,
3289
+ "grad_norm": 0.396484375,
3290
+ "learning_rate": 0.0005392850975651874,
3291
+ "loss": 0.9967,
3292
+ "step": 10695
3293
+ },
3294
+ {
3295
+ "epoch": 0.4622417734075128,
3296
+ "grad_norm": 0.41796875,
3297
+ "learning_rate": 0.0005382921775168364,
3298
+ "loss": 1.0009,
3299
+ "step": 10718
3300
+ },
3301
+ {
3302
+ "epoch": 0.4632337085435805,
3303
+ "grad_norm": 0.34765625,
3304
+ "learning_rate": 0.0005372992574684856,
3305
+ "loss": 0.9919,
3306
+ "step": 10741
3307
+ },
3308
+ {
3309
+ "epoch": 0.4642256436796481,
3310
+ "grad_norm": 0.3515625,
3311
+ "learning_rate": 0.0005363063374201347,
3312
+ "loss": 1.0128,
3313
+ "step": 10764
3314
+ },
3315
+ {
3316
+ "epoch": 0.4652175788157157,
3317
+ "grad_norm": 0.400390625,
3318
+ "learning_rate": 0.0005353134173717839,
3319
+ "loss": 0.9982,
3320
+ "step": 10787
3321
+ },
3322
+ {
3323
+ "epoch": 0.46620951395178334,
3324
+ "grad_norm": 0.3515625,
3325
+ "learning_rate": 0.0005343204973234329,
3326
+ "loss": 0.9998,
3327
+ "step": 10810
3328
+ },
3329
+ {
3330
+ "epoch": 0.46720144908785094,
3331
+ "grad_norm": 0.5390625,
3332
+ "learning_rate": 0.0005333275772750821,
3333
+ "loss": 1.0177,
3334
+ "step": 10833
3335
+ },
3336
+ {
3337
+ "epoch": 0.4681933842239186,
3338
+ "grad_norm": 0.37890625,
3339
+ "learning_rate": 0.0005323346572267311,
3340
+ "loss": 0.9899,
3341
+ "step": 10856
3342
+ },
3343
+ {
3344
+ "epoch": 0.4691853193599862,
3345
+ "grad_norm": 0.38671875,
3346
+ "learning_rate": 0.0005313417371783802,
3347
+ "loss": 1.0052,
3348
+ "step": 10879
3349
+ },
3350
+ {
3351
+ "epoch": 0.4701772544960538,
3352
+ "grad_norm": 0.36328125,
3353
+ "learning_rate": 0.0005303488171300294,
3354
+ "loss": 0.9741,
3355
+ "step": 10902
3356
+ },
3357
+ {
3358
+ "epoch": 0.47116918963212145,
3359
+ "grad_norm": 0.4453125,
3360
+ "learning_rate": 0.0005293558970816785,
3361
+ "loss": 1.0021,
3362
+ "step": 10925
3363
+ },
3364
+ {
3365
+ "epoch": 0.47216112476818906,
3366
+ "grad_norm": 0.322265625,
3367
+ "learning_rate": 0.0005283629770333276,
3368
+ "loss": 0.9896,
3369
+ "step": 10948
3370
+ },
3371
+ {
3372
+ "epoch": 0.4731530599042567,
3373
+ "grad_norm": 0.36328125,
3374
+ "learning_rate": 0.0005273700569849767,
3375
+ "loss": 1.0046,
3376
+ "step": 10971
3377
+ },
3378
+ {
3379
+ "epoch": 0.4741449950403243,
3380
+ "grad_norm": 0.345703125,
3381
+ "learning_rate": 0.0005263771369366258,
3382
+ "loss": 1.0004,
3383
+ "step": 10994
3384
+ },
3385
+ {
3386
+ "epoch": 0.47513693017639197,
3387
+ "grad_norm": 0.357421875,
3388
+ "learning_rate": 0.0005253842168882749,
3389
+ "loss": 1.0031,
3390
+ "step": 11017
3391
+ },
3392
+ {
3393
+ "epoch": 0.47612886531245957,
3394
+ "grad_norm": 0.359375,
3395
+ "learning_rate": 0.0005243912968399241,
3396
+ "loss": 1.007,
3397
+ "step": 11040
3398
+ },
3399
+ {
3400
+ "epoch": 0.47712080044852717,
3401
+ "grad_norm": 0.38671875,
3402
+ "learning_rate": 0.0005233983767915731,
3403
+ "loss": 1.0046,
3404
+ "step": 11063
3405
+ },
3406
+ {
3407
+ "epoch": 0.47811273558459483,
3408
+ "grad_norm": 0.341796875,
3409
+ "learning_rate": 0.0005224054567432222,
3410
+ "loss": 0.9956,
3411
+ "step": 11086
3412
+ },
3413
+ {
3414
+ "epoch": 0.47910467072066243,
3415
+ "grad_norm": 0.3515625,
3416
+ "learning_rate": 0.0005214125366948713,
3417
+ "loss": 1.01,
3418
+ "step": 11109
3419
+ },
3420
+ {
3421
+ "epoch": 0.4800966058567301,
3422
+ "grad_norm": 0.431640625,
3423
+ "learning_rate": 0.0005204196166465205,
3424
+ "loss": 1.0211,
3425
+ "step": 11132
3426
+ },
3427
+ {
3428
+ "epoch": 0.4810885409927977,
3429
+ "grad_norm": 0.375,
3430
+ "learning_rate": 0.0005194266965981696,
3431
+ "loss": 1.0039,
3432
+ "step": 11155
3433
+ },
3434
+ {
3435
+ "epoch": 0.4820804761288653,
3436
+ "grad_norm": 0.392578125,
3437
+ "learning_rate": 0.0005184337765498187,
3438
+ "loss": 0.9886,
3439
+ "step": 11178
3440
+ },
3441
+ {
3442
+ "epoch": 0.48307241126493294,
3443
+ "grad_norm": 0.515625,
3444
+ "learning_rate": 0.0005174408565014678,
3445
+ "loss": 0.9973,
3446
+ "step": 11201
3447
+ },
3448
+ {
3449
+ "epoch": 0.48406434640100054,
3450
+ "grad_norm": 0.396484375,
3451
+ "learning_rate": 0.0005164479364531169,
3452
+ "loss": 1.013,
3453
+ "step": 11224
3454
+ },
3455
+ {
3456
+ "epoch": 0.4850562815370682,
3457
+ "grad_norm": 0.451171875,
3458
+ "learning_rate": 0.000515455016404766,
3459
+ "loss": 0.9876,
3460
+ "step": 11247
3461
+ },
3462
+ {
3463
+ "epoch": 0.4860482166731358,
3464
+ "grad_norm": 0.375,
3465
+ "learning_rate": 0.0005144620963564152,
3466
+ "loss": 0.9984,
3467
+ "step": 11270
3468
+ },
3469
+ {
3470
+ "epoch": 0.48704015180920346,
3471
+ "grad_norm": 0.416015625,
3472
+ "learning_rate": 0.0005134691763080642,
3473
+ "loss": 0.9875,
3474
+ "step": 11293
3475
+ },
3476
+ {
3477
+ "epoch": 0.48803208694527106,
3478
+ "grad_norm": 0.369140625,
3479
+ "learning_rate": 0.0005124762562597134,
3480
+ "loss": 0.9954,
3481
+ "step": 11316
3482
+ },
3483
+ {
3484
+ "epoch": 0.48902402208133866,
3485
+ "grad_norm": 0.337890625,
3486
+ "learning_rate": 0.0005114833362113624,
3487
+ "loss": 0.9825,
3488
+ "step": 11339
3489
+ },
3490
+ {
3491
+ "epoch": 0.4900159572174063,
3492
+ "grad_norm": 0.37890625,
3493
+ "learning_rate": 0.0005104904161630117,
3494
+ "loss": 0.9983,
3495
+ "step": 11362
3496
+ },
3497
+ {
3498
+ "epoch": 0.4910078923534739,
3499
+ "grad_norm": 0.328125,
3500
+ "learning_rate": 0.0005094974961146607,
3501
+ "loss": 0.9818,
3502
+ "step": 11385
3503
+ },
3504
+ {
3505
+ "epoch": 0.4919998274895416,
3506
+ "grad_norm": 0.357421875,
3507
+ "learning_rate": 0.0005085045760663098,
3508
+ "loss": 0.9928,
3509
+ "step": 11408
3510
+ },
3511
+ {
3512
+ "epoch": 0.4929917626256092,
3513
+ "grad_norm": 0.4921875,
3514
+ "learning_rate": 0.0005075116560179589,
3515
+ "loss": 0.9771,
3516
+ "step": 11431
3517
+ },
3518
+ {
3519
+ "epoch": 0.4939836977616768,
3520
+ "grad_norm": 0.341796875,
3521
+ "learning_rate": 0.000506518735969608,
3522
+ "loss": 1.0059,
3523
+ "step": 11454
3524
+ },
3525
+ {
3526
+ "epoch": 0.49497563289774443,
3527
+ "grad_norm": 0.400390625,
3528
+ "learning_rate": 0.0005055258159212571,
3529
+ "loss": 1.0058,
3530
+ "step": 11477
3531
+ },
3532
+ {
3533
+ "epoch": 0.49596756803381203,
3534
+ "grad_norm": 0.328125,
3535
+ "learning_rate": 0.0005045328958729063,
3536
+ "loss": 0.9962,
3537
+ "step": 11500
3538
+ },
3539
+ {
3540
+ "epoch": 0.4969595031698797,
3541
+ "grad_norm": 0.326171875,
3542
+ "learning_rate": 0.0005035399758245554,
3543
+ "loss": 0.9828,
3544
+ "step": 11523
3545
+ },
3546
+ {
3547
+ "epoch": 0.4979514383059473,
3548
+ "grad_norm": 0.5078125,
3549
+ "learning_rate": 0.0005025470557762045,
3550
+ "loss": 0.9881,
3551
+ "step": 11546
3552
+ },
3553
+ {
3554
+ "epoch": 0.49894337344201495,
3555
+ "grad_norm": 0.43359375,
3556
+ "learning_rate": 0.0005015541357278536,
3557
+ "loss": 0.9863,
3558
+ "step": 11569
3559
+ },
3560
+ {
3561
+ "epoch": 0.49984905334885926,
3562
+ "eval_runtime": 163.9862,
3563
+ "eval_samples_per_second": 609.807,
3564
+ "eval_steps_per_second": 7.623,
3565
+ "step": 11590
3566
+ },
3567
+ {
3568
+ "epoch": 0.49993530857808255,
3569
+ "grad_norm": 0.353515625,
3570
+ "learning_rate": 0.0005005612156795026,
3571
+ "loss": 0.9764,
3572
+ "step": 11592
3573
+ },
3574
+ {
3575
+ "epoch": 0.5009272437141502,
3576
+ "grad_norm": 0.36328125,
3577
+ "learning_rate": 0.0004995682956311518,
3578
+ "loss": 0.9923,
3579
+ "step": 11615
3580
+ },
3581
+ {
3582
+ "epoch": 0.5019191788502178,
3583
+ "grad_norm": 0.39453125,
3584
+ "learning_rate": 0.0004985753755828009,
3585
+ "loss": 0.9738,
3586
+ "step": 11638
3587
+ },
3588
+ {
3589
+ "epoch": 0.5029111139862854,
3590
+ "grad_norm": 0.48828125,
3591
+ "learning_rate": 0.00049758245553445,
3592
+ "loss": 0.973,
3593
+ "step": 11661
3594
+ },
3595
+ {
3596
+ "epoch": 0.5039030491223531,
3597
+ "grad_norm": 0.384765625,
3598
+ "learning_rate": 0.0004965895354860991,
3599
+ "loss": 0.9741,
3600
+ "step": 11684
3601
+ },
3602
+ {
3603
+ "epoch": 0.5048949842584207,
3604
+ "grad_norm": 0.359375,
3605
+ "learning_rate": 0.0004955966154377482,
3606
+ "loss": 0.9842,
3607
+ "step": 11707
3608
+ },
3609
+ {
3610
+ "epoch": 0.5058869193944883,
3611
+ "grad_norm": 0.443359375,
3612
+ "learning_rate": 0.0004946036953893974,
3613
+ "loss": 0.9927,
3614
+ "step": 11730
3615
+ },
3616
+ {
3617
+ "epoch": 0.5068788545305559,
3618
+ "grad_norm": 0.44921875,
3619
+ "learning_rate": 0.0004936107753410465,
3620
+ "loss": 0.9921,
3621
+ "step": 11753
3622
+ },
3623
+ {
3624
+ "epoch": 0.5078707896666236,
3625
+ "grad_norm": 0.40625,
3626
+ "learning_rate": 0.0004926178552926956,
3627
+ "loss": 0.9827,
3628
+ "step": 11776
3629
+ },
3630
+ {
3631
+ "epoch": 0.5088627248026911,
3632
+ "grad_norm": 0.416015625,
3633
+ "learning_rate": 0.0004916249352443447,
3634
+ "loss": 0.9836,
3635
+ "step": 11799
3636
+ },
3637
+ {
3638
+ "epoch": 0.5098546599387588,
3639
+ "grad_norm": 0.36328125,
3640
+ "learning_rate": 0.0004906320151959938,
3641
+ "loss": 0.9783,
3642
+ "step": 11822
3643
+ },
3644
+ {
3645
+ "epoch": 0.5108465950748264,
3646
+ "grad_norm": 0.357421875,
3647
+ "learning_rate": 0.0004896390951476428,
3648
+ "loss": 1.0003,
3649
+ "step": 11845
3650
+ },
3651
+ {
3652
+ "epoch": 0.511838530210894,
3653
+ "grad_norm": 0.39453125,
3654
+ "learning_rate": 0.000488646175099292,
3655
+ "loss": 0.995,
3656
+ "step": 11868
3657
+ },
3658
+ {
3659
+ "epoch": 0.5128304653469616,
3660
+ "grad_norm": 0.376953125,
3661
+ "learning_rate": 0.0004876532550509411,
3662
+ "loss": 0.9952,
3663
+ "step": 11891
3664
+ },
3665
+ {
3666
+ "epoch": 0.5138224004830293,
3667
+ "grad_norm": 0.3828125,
3668
+ "learning_rate": 0.0004866603350025902,
3669
+ "loss": 0.9912,
3670
+ "step": 11914
3671
+ },
3672
+ {
3673
+ "epoch": 0.514814335619097,
3674
+ "grad_norm": 0.34375,
3675
+ "learning_rate": 0.00048566741495423933,
3676
+ "loss": 0.995,
3677
+ "step": 11937
3678
+ },
3679
+ {
3680
+ "epoch": 0.5158062707551645,
3681
+ "grad_norm": 0.408203125,
3682
+ "learning_rate": 0.00048467449490588845,
3683
+ "loss": 0.9856,
3684
+ "step": 11960
3685
+ },
3686
+ {
3687
+ "epoch": 0.5167982058912322,
3688
+ "grad_norm": 0.427734375,
3689
+ "learning_rate": 0.00048368157485753757,
3690
+ "loss": 0.9887,
3691
+ "step": 11983
3692
+ },
3693
+ {
3694
+ "epoch": 0.5177901410272998,
3695
+ "grad_norm": 0.376953125,
3696
+ "learning_rate": 0.0004826886548091867,
3697
+ "loss": 0.9815,
3698
+ "step": 12006
3699
+ },
3700
+ {
3701
+ "epoch": 0.5187820761633674,
3702
+ "grad_norm": 0.369140625,
3703
+ "learning_rate": 0.00048169573476083575,
3704
+ "loss": 0.9944,
3705
+ "step": 12029
3706
+ },
3707
+ {
3708
+ "epoch": 0.519774011299435,
3709
+ "grad_norm": 0.427734375,
3710
+ "learning_rate": 0.0004807028147124849,
3711
+ "loss": 0.9714,
3712
+ "step": 12052
3713
+ },
3714
+ {
3715
+ "epoch": 0.5207659464355027,
3716
+ "grad_norm": 0.326171875,
3717
+ "learning_rate": 0.000479709894664134,
3718
+ "loss": 0.9849,
3719
+ "step": 12075
3720
+ },
3721
+ {
3722
+ "epoch": 0.5217578815715703,
3723
+ "grad_norm": 0.427734375,
3724
+ "learning_rate": 0.0004787169746157831,
3725
+ "loss": 0.9861,
3726
+ "step": 12098
3727
+ },
3728
+ {
3729
+ "epoch": 0.5227498167076379,
3730
+ "grad_norm": 0.4765625,
3731
+ "learning_rate": 0.00047772405456743223,
3732
+ "loss": 1.0009,
3733
+ "step": 12121
3734
+ },
3735
+ {
3736
+ "epoch": 0.5237417518437055,
3737
+ "grad_norm": 0.345703125,
3738
+ "learning_rate": 0.00047673113451908135,
3739
+ "loss": 0.9892,
3740
+ "step": 12144
3741
+ },
3742
+ {
3743
+ "epoch": 0.5247336869797732,
3744
+ "grad_norm": 0.345703125,
3745
+ "learning_rate": 0.00047573821447073047,
3746
+ "loss": 0.9843,
3747
+ "step": 12167
3748
+ },
3749
+ {
3750
+ "epoch": 0.5257256221158407,
3751
+ "grad_norm": 0.40234375,
3752
+ "learning_rate": 0.0004747452944223796,
3753
+ "loss": 0.9767,
3754
+ "step": 12190
3755
+ },
3756
+ {
3757
+ "epoch": 0.5267175572519084,
3758
+ "grad_norm": 0.359375,
3759
+ "learning_rate": 0.00047375237437402866,
3760
+ "loss": 0.9599,
3761
+ "step": 12213
3762
+ },
3763
+ {
3764
+ "epoch": 0.527709492387976,
3765
+ "grad_norm": 0.388671875,
3766
+ "learning_rate": 0.0004727594543256778,
3767
+ "loss": 0.9797,
3768
+ "step": 12236
3769
+ },
3770
+ {
3771
+ "epoch": 0.5287014275240437,
3772
+ "grad_norm": 0.3359375,
3773
+ "learning_rate": 0.0004717665342773269,
3774
+ "loss": 0.9952,
3775
+ "step": 12259
3776
+ },
3777
+ {
3778
+ "epoch": 0.5296933626601112,
3779
+ "grad_norm": 0.359375,
3780
+ "learning_rate": 0.000470773614228976,
3781
+ "loss": 0.9851,
3782
+ "step": 12282
3783
+ },
3784
+ {
3785
+ "epoch": 0.5306852977961789,
3786
+ "grad_norm": 0.4140625,
3787
+ "learning_rate": 0.00046978069418062514,
3788
+ "loss": 0.9728,
3789
+ "step": 12305
3790
+ },
3791
+ {
3792
+ "epoch": 0.5316772329322466,
3793
+ "grad_norm": 0.376953125,
3794
+ "learning_rate": 0.00046878777413227426,
3795
+ "loss": 0.9813,
3796
+ "step": 12328
3797
+ },
3798
+ {
3799
+ "epoch": 0.5326691680683141,
3800
+ "grad_norm": 0.326171875,
3801
+ "learning_rate": 0.0004677948540839234,
3802
+ "loss": 0.9729,
3803
+ "step": 12351
3804
+ },
3805
+ {
3806
+ "epoch": 0.5336611032043818,
3807
+ "grad_norm": 0.33203125,
3808
+ "learning_rate": 0.0004668019340355725,
3809
+ "loss": 0.969,
3810
+ "step": 12374
3811
+ },
3812
+ {
3813
+ "epoch": 0.5346530383404494,
3814
+ "grad_norm": 0.43359375,
3815
+ "learning_rate": 0.00046580901398722156,
3816
+ "loss": 0.9786,
3817
+ "step": 12397
3818
+ },
3819
+ {
3820
+ "epoch": 0.535644973476517,
3821
+ "grad_norm": 0.388671875,
3822
+ "learning_rate": 0.00046481609393887063,
3823
+ "loss": 0.9773,
3824
+ "step": 12420
3825
+ },
3826
+ {
3827
+ "epoch": 0.5366369086125846,
3828
+ "grad_norm": 0.451171875,
3829
+ "learning_rate": 0.00046382317389051975,
3830
+ "loss": 0.9972,
3831
+ "step": 12443
3832
+ },
3833
+ {
3834
+ "epoch": 0.5376288437486523,
3835
+ "grad_norm": 0.408203125,
3836
+ "learning_rate": 0.00046283025384216887,
3837
+ "loss": 0.9893,
3838
+ "step": 12466
3839
+ },
3840
+ {
3841
+ "epoch": 0.5386207788847199,
3842
+ "grad_norm": 0.400390625,
3843
+ "learning_rate": 0.000461837333793818,
3844
+ "loss": 0.9747,
3845
+ "step": 12489
3846
+ },
3847
+ {
3848
+ "epoch": 0.5396127140207875,
3849
+ "grad_norm": 0.4921875,
3850
+ "learning_rate": 0.0004608444137454671,
3851
+ "loss": 0.9795,
3852
+ "step": 12512
3853
+ },
3854
+ {
3855
+ "epoch": 0.5406046491568551,
3856
+ "grad_norm": 0.37109375,
3857
+ "learning_rate": 0.00045985149369711623,
3858
+ "loss": 0.9608,
3859
+ "step": 12535
3860
+ },
3861
+ {
3862
+ "epoch": 0.5415965842929228,
3863
+ "grad_norm": 0.3515625,
3864
+ "learning_rate": 0.00045885857364876535,
3865
+ "loss": 0.966,
3866
+ "step": 12558
3867
+ },
3868
+ {
3869
+ "epoch": 0.5425885194289903,
3870
+ "grad_norm": 0.361328125,
3871
+ "learning_rate": 0.0004578656536004144,
3872
+ "loss": 0.9689,
3873
+ "step": 12581
3874
+ },
3875
+ {
3876
+ "epoch": 0.543580454565058,
3877
+ "grad_norm": 0.404296875,
3878
+ "learning_rate": 0.00045687273355206353,
3879
+ "loss": 0.9655,
3880
+ "step": 12604
3881
+ },
3882
+ {
3883
+ "epoch": 0.5445723897011256,
3884
+ "grad_norm": 0.37890625,
3885
+ "learning_rate": 0.00045587981350371265,
3886
+ "loss": 0.9693,
3887
+ "step": 12627
3888
+ },
3889
+ {
3890
+ "epoch": 0.5455643248371933,
3891
+ "grad_norm": 0.41015625,
3892
+ "learning_rate": 0.00045488689345536177,
3893
+ "loss": 0.9986,
3894
+ "step": 12650
3895
+ },
3896
+ {
3897
+ "epoch": 0.5465562599732608,
3898
+ "grad_norm": 0.345703125,
3899
+ "learning_rate": 0.0004538939734070109,
3900
+ "loss": 0.9715,
3901
+ "step": 12673
3902
+ },
3903
+ {
3904
+ "epoch": 0.5475481951093285,
3905
+ "grad_norm": 0.37890625,
3906
+ "learning_rate": 0.00045290105335866,
3907
+ "loss": 0.9781,
3908
+ "step": 12696
3909
+ },
3910
+ {
3911
+ "epoch": 0.5485401302453962,
3912
+ "grad_norm": 0.42578125,
3913
+ "learning_rate": 0.00045190813331030913,
3914
+ "loss": 1.0001,
3915
+ "step": 12719
3916
+ },
3917
+ {
3918
+ "epoch": 0.5495320653814637,
3919
+ "grad_norm": 0.43359375,
3920
+ "learning_rate": 0.0004509152132619582,
3921
+ "loss": 0.9811,
3922
+ "step": 12742
3923
+ },
3924
+ {
3925
+ "epoch": 0.5505240005175314,
3926
+ "grad_norm": 0.341796875,
3927
+ "learning_rate": 0.0004499222932136073,
3928
+ "loss": 0.9584,
3929
+ "step": 12765
3930
+ },
3931
+ {
3932
+ "epoch": 0.551515935653599,
3933
+ "grad_norm": 0.419921875,
3934
+ "learning_rate": 0.00044892937316525644,
3935
+ "loss": 0.977,
3936
+ "step": 12788
3937
+ },
3938
+ {
3939
+ "epoch": 0.5525078707896667,
3940
+ "grad_norm": 0.416015625,
3941
+ "learning_rate": 0.00044793645311690556,
3942
+ "loss": 0.9746,
3943
+ "step": 12811
3944
+ },
3945
+ {
3946
+ "epoch": 0.5534998059257342,
3947
+ "grad_norm": 0.390625,
3948
+ "learning_rate": 0.0004469435330685547,
3949
+ "loss": 0.9811,
3950
+ "step": 12834
3951
+ },
3952
+ {
3953
+ "epoch": 0.5544917410618019,
3954
+ "grad_norm": 0.35546875,
3955
+ "learning_rate": 0.0004459506130202038,
3956
+ "loss": 0.9523,
3957
+ "step": 12857
3958
+ },
3959
+ {
3960
+ "epoch": 0.5554836761978695,
3961
+ "grad_norm": 0.37890625,
3962
+ "learning_rate": 0.0004449576929718529,
3963
+ "loss": 0.9641,
3964
+ "step": 12880
3965
+ },
3966
+ {
3967
+ "epoch": 0.5564756113339371,
3968
+ "grad_norm": 0.36328125,
3969
+ "learning_rate": 0.00044396477292350204,
3970
+ "loss": 0.9845,
3971
+ "step": 12903
3972
+ },
3973
+ {
3974
+ "epoch": 0.5574675464700047,
3975
+ "grad_norm": 0.365234375,
3976
+ "learning_rate": 0.0004429718528751511,
3977
+ "loss": 0.9788,
3978
+ "step": 12926
3979
+ },
3980
+ {
3981
+ "epoch": 0.5584594816060724,
3982
+ "grad_norm": 0.390625,
3983
+ "learning_rate": 0.0004419789328268002,
3984
+ "loss": 0.9795,
3985
+ "step": 12949
3986
+ },
3987
+ {
3988
+ "epoch": 0.5594514167421399,
3989
+ "grad_norm": 0.37109375,
3990
+ "learning_rate": 0.00044098601277844934,
3991
+ "loss": 0.9716,
3992
+ "step": 12972
3993
+ },
3994
+ {
3995
+ "epoch": 0.5604433518782076,
3996
+ "grad_norm": 0.38671875,
3997
+ "learning_rate": 0.00043999309273009846,
3998
+ "loss": 0.9814,
3999
+ "step": 12995
4000
+ },
4001
+ {
4002
+ "epoch": 0.5614352870142753,
4003
+ "grad_norm": 0.34765625,
4004
+ "learning_rate": 0.00043900017268174753,
4005
+ "loss": 0.9724,
4006
+ "step": 13018
4007
+ },
4008
+ {
4009
+ "epoch": 0.5624272221503429,
4010
+ "grad_norm": 0.44921875,
4011
+ "learning_rate": 0.00043800725263339665,
4012
+ "loss": 0.9538,
4013
+ "step": 13041
4014
+ },
4015
+ {
4016
+ "epoch": 0.5634191572864105,
4017
+ "grad_norm": 0.3828125,
4018
+ "learning_rate": 0.00043701433258504577,
4019
+ "loss": 0.9744,
4020
+ "step": 13064
4021
+ },
4022
+ {
4023
+ "epoch": 0.5644110924224781,
4024
+ "grad_norm": 0.423828125,
4025
+ "learning_rate": 0.0004360214125366949,
4026
+ "loss": 0.9777,
4027
+ "step": 13087
4028
+ },
4029
+ {
4030
+ "epoch": 0.5654030275585458,
4031
+ "grad_norm": 0.365234375,
4032
+ "learning_rate": 0.00043502849248834395,
4033
+ "loss": 0.9688,
4034
+ "step": 13110
4035
+ },
4036
+ {
4037
+ "epoch": 0.5663949626946133,
4038
+ "grad_norm": 0.470703125,
4039
+ "learning_rate": 0.00043403557243999307,
4040
+ "loss": 0.988,
4041
+ "step": 13133
4042
+ },
4043
+ {
4044
+ "epoch": 0.567386897830681,
4045
+ "grad_norm": 0.341796875,
4046
+ "learning_rate": 0.0004330426523916422,
4047
+ "loss": 0.9678,
4048
+ "step": 13156
4049
+ },
4050
+ {
4051
+ "epoch": 0.5683788329667486,
4052
+ "grad_norm": 0.345703125,
4053
+ "learning_rate": 0.0004320497323432913,
4054
+ "loss": 0.9735,
4055
+ "step": 13179
4056
+ },
4057
+ {
4058
+ "epoch": 0.5693707681028163,
4059
+ "grad_norm": 0.416015625,
4060
+ "learning_rate": 0.00043105681229494043,
4061
+ "loss": 0.9612,
4062
+ "step": 13202
4063
+ },
4064
+ {
4065
+ "epoch": 0.5703627032388838,
4066
+ "grad_norm": 0.375,
4067
+ "learning_rate": 0.00043006389224658955,
4068
+ "loss": 0.9428,
4069
+ "step": 13225
4070
+ },
4071
+ {
4072
+ "epoch": 0.5713546383749515,
4073
+ "grad_norm": 0.4296875,
4074
+ "learning_rate": 0.00042907097219823867,
4075
+ "loss": 0.9654,
4076
+ "step": 13248
4077
+ },
4078
+ {
4079
+ "epoch": 0.5723465735110191,
4080
+ "grad_norm": 0.353515625,
4081
+ "learning_rate": 0.0004280780521498878,
4082
+ "loss": 0.9739,
4083
+ "step": 13271
4084
+ },
4085
+ {
4086
+ "epoch": 0.5733385086470867,
4087
+ "grad_norm": 0.380859375,
4088
+ "learning_rate": 0.00042708513210153686,
4089
+ "loss": 0.9755,
4090
+ "step": 13294
4091
+ },
4092
+ {
4093
+ "epoch": 0.5743304437831543,
4094
+ "grad_norm": 0.357421875,
4095
+ "learning_rate": 0.000426092212053186,
4096
+ "loss": 0.9784,
4097
+ "step": 13317
4098
+ },
4099
+ {
4100
+ "epoch": 0.575322378919222,
4101
+ "grad_norm": 0.3125,
4102
+ "learning_rate": 0.0004250992920048351,
4103
+ "loss": 0.9625,
4104
+ "step": 13340
4105
+ },
4106
+ {
4107
+ "epoch": 0.5763143140552897,
4108
+ "grad_norm": 0.345703125,
4109
+ "learning_rate": 0.0004241063719564842,
4110
+ "loss": 0.9521,
4111
+ "step": 13363
4112
+ },
4113
+ {
4114
+ "epoch": 0.5773062491913572,
4115
+ "grad_norm": 0.333984375,
4116
+ "learning_rate": 0.00042311345190813334,
4117
+ "loss": 0.984,
4118
+ "step": 13386
4119
+ },
4120
+ {
4121
+ "epoch": 0.5782981843274249,
4122
+ "grad_norm": 0.45703125,
4123
+ "learning_rate": 0.00042212053185978246,
4124
+ "loss": 0.9794,
4125
+ "step": 13409
4126
+ },
4127
+ {
4128
+ "epoch": 0.5792901194634925,
4129
+ "grad_norm": 0.396484375,
4130
+ "learning_rate": 0.0004211276118114316,
4131
+ "loss": 0.9705,
4132
+ "step": 13432
4133
+ },
4134
+ {
4135
+ "epoch": 0.5802820545995601,
4136
+ "grad_norm": 0.400390625,
4137
+ "learning_rate": 0.00042013469176308064,
4138
+ "loss": 0.97,
4139
+ "step": 13455
4140
+ },
4141
+ {
4142
+ "epoch": 0.5812739897356277,
4143
+ "grad_norm": 0.37890625,
4144
+ "learning_rate": 0.00041914177171472976,
4145
+ "loss": 0.968,
4146
+ "step": 13478
4147
+ },
4148
+ {
4149
+ "epoch": 0.5822659248716954,
4150
+ "grad_norm": 0.365234375,
4151
+ "learning_rate": 0.0004181488516663789,
4152
+ "loss": 0.9664,
4153
+ "step": 13501
4154
+ },
4155
+ {
4156
+ "epoch": 0.5832578600077629,
4157
+ "grad_norm": 0.361328125,
4158
+ "learning_rate": 0.000417155931618028,
4159
+ "loss": 0.9722,
4160
+ "step": 13524
4161
+ },
4162
+ {
4163
+ "epoch": 0.5842497951438306,
4164
+ "grad_norm": 0.369140625,
4165
+ "learning_rate": 0.0004161630115696771,
4166
+ "loss": 0.9695,
4167
+ "step": 13547
4168
+ },
4169
+ {
4170
+ "epoch": 0.5852417302798982,
4171
+ "grad_norm": 0.337890625,
4172
+ "learning_rate": 0.00041517009152132624,
4173
+ "loss": 0.9628,
4174
+ "step": 13570
4175
+ },
4176
+ {
4177
+ "epoch": 0.5862336654159659,
4178
+ "grad_norm": 0.330078125,
4179
+ "learning_rate": 0.0004141771714729753,
4180
+ "loss": 0.9515,
4181
+ "step": 13593
4182
+ },
4183
+ {
4184
+ "epoch": 0.5872256005520334,
4185
+ "grad_norm": 0.359375,
4186
+ "learning_rate": 0.0004131842514246244,
4187
+ "loss": 0.965,
4188
+ "step": 13616
4189
+ },
4190
+ {
4191
+ "epoch": 0.5882175356881011,
4192
+ "grad_norm": 0.392578125,
4193
+ "learning_rate": 0.0004121913313762735,
4194
+ "loss": 0.9598,
4195
+ "step": 13639
4196
+ },
4197
+ {
4198
+ "epoch": 0.5892094708241687,
4199
+ "grad_norm": 0.41796875,
4200
+ "learning_rate": 0.0004111984113279226,
4201
+ "loss": 0.9575,
4202
+ "step": 13662
4203
+ },
4204
+ {
4205
+ "epoch": 0.5902014059602363,
4206
+ "grad_norm": 0.5234375,
4207
+ "learning_rate": 0.00041020549127957173,
4208
+ "loss": 0.9933,
4209
+ "step": 13685
4210
+ },
4211
+ {
4212
+ "epoch": 0.591193341096304,
4213
+ "grad_norm": 0.423828125,
4214
+ "learning_rate": 0.00040921257123122085,
4215
+ "loss": 0.9621,
4216
+ "step": 13708
4217
+ },
4218
+ {
4219
+ "epoch": 0.5921852762323716,
4220
+ "grad_norm": 0.33203125,
4221
+ "learning_rate": 0.00040821965118286997,
4222
+ "loss": 0.964,
4223
+ "step": 13731
4224
+ },
4225
+ {
4226
+ "epoch": 0.5931772113684393,
4227
+ "grad_norm": 0.423828125,
4228
+ "learning_rate": 0.0004072267311345191,
4229
+ "loss": 0.9854,
4230
+ "step": 13754
4231
+ },
4232
+ {
4233
+ "epoch": 0.5941691465045068,
4234
+ "grad_norm": 0.3515625,
4235
+ "learning_rate": 0.0004062338110861682,
4236
+ "loss": 0.9883,
4237
+ "step": 13777
4238
+ },
4239
+ {
4240
+ "epoch": 0.5951610816405745,
4241
+ "grad_norm": 0.408203125,
4242
+ "learning_rate": 0.00040524089103781733,
4243
+ "loss": 0.9853,
4244
+ "step": 13800
4245
+ },
4246
+ {
4247
+ "epoch": 0.5961530167766421,
4248
+ "grad_norm": 0.408203125,
4249
+ "learning_rate": 0.0004042479709894664,
4250
+ "loss": 0.9557,
4251
+ "step": 13823
4252
+ },
4253
+ {
4254
+ "epoch": 0.5971449519127097,
4255
+ "grad_norm": 0.42578125,
4256
+ "learning_rate": 0.0004032550509411155,
4257
+ "loss": 0.9587,
4258
+ "step": 13846
4259
+ },
4260
+ {
4261
+ "epoch": 0.5981368870487773,
4262
+ "grad_norm": 0.44921875,
4263
+ "learning_rate": 0.00040226213089276464,
4264
+ "loss": 0.9771,
4265
+ "step": 13869
4266
+ },
4267
+ {
4268
+ "epoch": 0.599128822184845,
4269
+ "grad_norm": 0.431640625,
4270
+ "learning_rate": 0.00040126921084441376,
4271
+ "loss": 0.9661,
4272
+ "step": 13892
4273
+ },
4274
+ {
4275
+ "epoch": 0.5998188640186312,
4276
+ "eval_runtime": 163.7921,
4277
+ "eval_samples_per_second": 610.53,
4278
+ "eval_steps_per_second": 7.632,
4279
+ "step": 13908
4280
  }
4281
  ],
4282
  "logging_steps": 23,
 
4296
  "attributes": {}
4297
  }
4298
  },
4299
+ "total_flos": 1.0167159364234772e+18,
4300
  "train_batch_size": 8,
4301
  "trial_name": null,
4302
  "trial_params": null