🍻 cheers

Browse files

Files changed (6) hide show

README.md +8 -5
all_results.json +10 -10
eval_results.json +5 -5
runs/Mar29_16-45-17_X5C922065N/events.out.tfevents.1711732953.X5C922065N.53009.5 +3 -0
train_results.json +5 -5
trainer_state.json +511 -728

README.md CHANGED Viewed

@@ -1,6 +1,9 @@
 ---
 base_model: d071696/vit-finetune-scrap
 tags:
 - generated_from_trainer
 datasets:
 - arrow
@@ -13,7 +16,7 @@ model-index:
       name: Image Classification
       type: image-classification
     dataset:
-      name: arrow
       type: arrow
       config: default
       split: train
@@ -21,7 +24,7 @@ model-index:
     metrics:
     - name: Accuracy
       type: accuracy
-      value: 0.9485530546623794
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -29,10 +32,10 @@ should probably proofread and complete it, then remove this comment. -->
 # vit-finetune-scrap
-This model is a fine-tuned version of [d071696/vit-finetune-scrap](https://huggingface.co/d071696/vit-finetune-scrap) on the arrow dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.2143
-- Accuracy: 0.9486
 ## Model description

 ---
 base_model: d071696/vit-finetune-scrap
 tags:
+- image-classification
+- image-feature-extraction
+- image-to-text
 - generated_from_trainer
 datasets:
 - arrow
       name: Image Classification
       type: image-classification
     dataset:
+      name: d071696/scraps1
       type: arrow
       config: default
       split: train
     metrics:
     - name: Accuracy
       type: accuracy
+      value: 0.9963782696177063
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 # vit-finetune-scrap
+This model is a fine-tuned version of [d071696/vit-finetune-scrap](https://huggingface.co/d071696/vit-finetune-scrap) on the d071696/scraps1 dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.0129
+- Accuracy: 0.9964
 ## Model description

all_results.json CHANGED Viewed

@@ -1,13 +1,13 @@
 {
     "epoch": 4.0,
-    "eval_accuracy": 1.0,
-    "eval_loss": 0.9895318150520325,
-    "eval_runtime": 1.234,
-    "eval_samples_per_second": 24.312,
-    "eval_steps_per_second": 3.242,
-    "total_flos": 9.63148132192297e+17,
-    "train_loss": 0.48303211886426173,
-    "train_runtime": 1041.8456,
-    "train_samples_per_second": 11.929,
-    "train_steps_per_second": 1.494
 }

 {
     "epoch": 4.0,
+    "eval_accuracy": 0.9963782696177063,
+    "eval_loss": 0.012937591411173344,
+    "eval_runtime": 48.4901,
+    "eval_samples_per_second": 51.248,
+    "eval_steps_per_second": 6.414,
+    "total_flos": 7.703325099767808e+17,
+    "train_loss": 0.1579143282675882,
+    "train_runtime": 491.3754,
+    "train_samples_per_second": 20.229,
+    "train_steps_per_second": 2.532
 }

eval_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 4.0,
-    "eval_accuracy": 1.0,
-    "eval_loss": 0.9895318150520325,
-    "eval_runtime": 1.234,
-    "eval_samples_per_second": 24.312,
-    "eval_steps_per_second": 3.242
 }

 {
     "epoch": 4.0,
+    "eval_accuracy": 0.9963782696177063,
+    "eval_loss": 0.012937591411173344,
+    "eval_runtime": 48.4901,
+    "eval_samples_per_second": 51.248,
+    "eval_steps_per_second": 6.414
 }

runs/Mar29_16-45-17_X5C922065N/events.out.tfevents.1711732953.X5C922065N.53009.5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:23139dbd44575df7213bf6b823555c99387ed9f6983bbe6670af966d47b34125
+size 734

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 4.0,
-    "total_flos": 9.63148132192297e+17,
-    "train_loss": 0.48303211886426173,
-    "train_runtime": 1041.8456,
-    "train_samples_per_second": 11.929,
-    "train_steps_per_second": 1.494
 }

 {
     "epoch": 4.0,
+    "total_flos": 7.703325099767808e+17,
+    "train_loss": 0.1579143282675882,
+    "train_runtime": 491.3754,
+    "train_samples_per_second": 20.229,
+    "train_steps_per_second": 2.532
 }

trainer_state.json CHANGED Viewed

@@ -1,1123 +1,906 @@
 {
-  "best_metric": 0.11155818402767181,
   "best_model_checkpoint": "./vit-finetune-scrap/checkpoint-1000",
   "epoch": 4.0,
   "eval_steps": 1000,
-  "global_step": 1556,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.03,
-      "grad_norm": 2.721052646636963,
-      "learning_rate": 0.0001987146529562982,
-      "loss": 2.3309,
       "step": 10
     },
     {
-      "epoch": 0.05,
-      "grad_norm": 2.6302196979522705,
-      "learning_rate": 0.0001974293059125964,
-      "loss": 2.1693,
       "step": 20
     },
     {
-      "epoch": 0.08,
-      "grad_norm": 3.2131187915802,
-      "learning_rate": 0.0001961439588688946,
-      "loss": 1.914,
       "step": 30
     },
     {
-      "epoch": 0.1,
-      "grad_norm": 3.520822525024414,
-      "learning_rate": 0.00019485861182519281,
-      "loss": 1.6374,
       "step": 40
     },
     {
-      "epoch": 0.13,
-      "grad_norm": 4.047000408172607,
-      "learning_rate": 0.000193573264781491,
-      "loss": 1.6214,
       "step": 50
     },
     {
-      "epoch": 0.15,
-      "grad_norm": 3.1879024505615234,
-      "learning_rate": 0.0001922879177377892,
-      "loss": 1.6056,
       "step": 60
     },
     {
-      "epoch": 0.18,
-      "grad_norm": 3.4971909523010254,
-      "learning_rate": 0.00019100257069408743,
-      "loss": 1.4073,
       "step": 70
     },
     {
-      "epoch": 0.21,
-      "grad_norm": 5.953548908233643,
-      "learning_rate": 0.00018971722365038562,
-      "loss": 1.2913,
       "step": 80
     },
     {
-      "epoch": 0.23,
-      "grad_norm": 3.639462471008301,
-      "learning_rate": 0.00018843187660668383,
-      "loss": 1.1548,
       "step": 90
     },
     {
-      "epoch": 0.26,
-      "grad_norm": 5.055682182312012,
-      "learning_rate": 0.00018714652956298202,
-      "loss": 1.3169,
       "step": 100
     },
     {
-      "epoch": 0.28,
-      "grad_norm": 2.787602186203003,
-      "learning_rate": 0.0001858611825192802,
-      "loss": 1.2729,
       "step": 110
     },
     {
-      "epoch": 0.31,
-      "grad_norm": 5.495873928070068,
-      "learning_rate": 0.00018457583547557842,
-      "loss": 1.1431,
       "step": 120
     },
     {
-      "epoch": 0.33,
-      "grad_norm": 2.6707160472869873,
-      "learning_rate": 0.0001832904884318766,
-      "loss": 1.0606,
       "step": 130
     },
     {
-      "epoch": 0.36,
-      "grad_norm": 5.753376483917236,
-      "learning_rate": 0.00018200514138817483,
-      "loss": 1.0528,
       "step": 140
     },
     {
-      "epoch": 0.39,
-      "grad_norm": 4.965968132019043,
-      "learning_rate": 0.000180719794344473,
-      "loss": 1.3918,
       "step": 150
     },
     {
-      "epoch": 0.41,
-      "grad_norm": 4.477539539337158,
-      "learning_rate": 0.0001794344473007712,
-      "loss": 1.2681,
       "step": 160
     },
     {
-      "epoch": 0.44,
-      "grad_norm": 6.264174938201904,
-      "learning_rate": 0.00017814910025706942,
-      "loss": 1.0713,
       "step": 170
     },
     {
-      "epoch": 0.46,
-      "grad_norm": 3.289985179901123,
-      "learning_rate": 0.0001768637532133676,
-      "loss": 1.1408,
       "step": 180
     },
     {
-      "epoch": 0.49,
-      "grad_norm": 4.64877986907959,
-      "learning_rate": 0.00017557840616966582,
-      "loss": 1.3037,
       "step": 190
     },
     {
-      "epoch": 0.51,
-      "grad_norm": 3.4218943119049072,
-      "learning_rate": 0.000174293059125964,
-      "loss": 0.9621,
       "step": 200
     },
     {
-      "epoch": 0.54,
-      "grad_norm": 5.507615566253662,
-      "learning_rate": 0.00017300771208226222,
-      "loss": 1.0243,
       "step": 210
     },
     {
-      "epoch": 0.57,
-      "grad_norm": 4.309627532958984,
-      "learning_rate": 0.00017172236503856043,
-      "loss": 0.9739,
       "step": 220
     },
     {
-      "epoch": 0.59,
-      "grad_norm": 4.056205749511719,
-      "learning_rate": 0.00017043701799485862,
-      "loss": 0.9158,
       "step": 230
     },
     {
-      "epoch": 0.62,
-      "grad_norm": 3.1590564250946045,
-      "learning_rate": 0.00016915167095115684,
-      "loss": 0.8557,
       "step": 240
     },
     {
-      "epoch": 0.64,
-      "grad_norm": 1.6367921829223633,
-      "learning_rate": 0.00016786632390745502,
-      "loss": 1.0898,
       "step": 250
     },
     {
-      "epoch": 0.67,
-      "grad_norm": 5.508506774902344,
-      "learning_rate": 0.0001665809768637532,
-      "loss": 1.0173,
       "step": 260
     },
     {
-      "epoch": 0.69,
-      "grad_norm": 5.602323532104492,
-      "learning_rate": 0.00016529562982005143,
-      "loss": 0.9706,
       "step": 270
     },
     {
-      "epoch": 0.72,
-      "grad_norm": 8.27458381652832,
-      "learning_rate": 0.00016401028277634961,
-      "loss": 1.1064,
       "step": 280
     },
     {
-      "epoch": 0.75,
-      "grad_norm": 3.5698864459991455,
-      "learning_rate": 0.00016272493573264783,
-      "loss": 0.9979,
       "step": 290
     },
     {
-      "epoch": 0.77,
-      "grad_norm": 5.842220783233643,
-      "learning_rate": 0.00016143958868894602,
-      "loss": 1.0221,
       "step": 300
     },
     {
-      "epoch": 0.8,
-      "grad_norm": 3.458761692047119,
-      "learning_rate": 0.00016015424164524423,
-      "loss": 0.9931,
       "step": 310
     },
     {
-      "epoch": 0.82,
-      "grad_norm": 5.971825122833252,
-      "learning_rate": 0.00015886889460154242,
-      "loss": 1.1686,
       "step": 320
     },
     {
-      "epoch": 0.85,
-      "grad_norm": 5.68731689453125,
-      "learning_rate": 0.0001575835475578406,
-      "loss": 0.9805,
       "step": 330
     },
     {
-      "epoch": 0.87,
-      "grad_norm": 5.103214263916016,
-      "learning_rate": 0.00015629820051413882,
-      "loss": 0.8668,
       "step": 340
     },
     {
-      "epoch": 0.9,
-      "grad_norm": 4.177506923675537,
-      "learning_rate": 0.00015501285347043704,
-      "loss": 1.1952,
       "step": 350
     },
     {
-      "epoch": 0.93,
-      "grad_norm": 1.9655299186706543,
-      "learning_rate": 0.00015372750642673522,
-      "loss": 0.8981,
       "step": 360
     },
     {
-      "epoch": 0.95,
-      "grad_norm": 4.982448577880859,
-      "learning_rate": 0.00015244215938303344,
-      "loss": 0.7721,
       "step": 370
     },
     {
-      "epoch": 0.98,
-      "grad_norm": 5.1874775886535645,
-      "learning_rate": 0.00015115681233933163,
-      "loss": 0.97,
       "step": 380
     },
     {
-      "epoch": 1.0,
-      "grad_norm": 4.942078590393066,
-      "learning_rate": 0.00014987146529562984,
-      "loss": 0.8415,
       "step": 390
     },
     {
-      "epoch": 1.03,
-      "grad_norm": 3.160961389541626,
-      "learning_rate": 0.00014858611825192803,
-      "loss": 0.5367,
       "step": 400
     },
     {
-      "epoch": 1.05,
-      "grad_norm": 5.394630432128906,
-      "learning_rate": 0.00014730077120822622,
-      "loss": 0.561,
       "step": 410
     },
     {
-      "epoch": 1.08,
-      "grad_norm": 2.2095775604248047,
-      "learning_rate": 0.00014601542416452443,
-      "loss": 0.5548,
       "step": 420
     },
     {
-      "epoch": 1.11,
-      "grad_norm": 4.9532575607299805,
-      "learning_rate": 0.00014473007712082262,
-      "loss": 0.6005,
       "step": 430
     },
     {
-      "epoch": 1.13,
-      "grad_norm": 5.503066062927246,
-      "learning_rate": 0.00014344473007712083,
-      "loss": 0.514,
       "step": 440
     },
     {
-      "epoch": 1.16,
-      "grad_norm": 5.952071189880371,
-      "learning_rate": 0.00014215938303341902,
-      "loss": 0.5386,
       "step": 450
     },
     {
-      "epoch": 1.18,
-      "grad_norm": 4.198409557342529,
-      "learning_rate": 0.00014087403598971724,
-      "loss": 0.5937,
       "step": 460
     },
     {
-      "epoch": 1.21,
-      "grad_norm": 4.768213272094727,
-      "learning_rate": 0.00013958868894601542,
-      "loss": 0.6504,
       "step": 470
     },
     {
-      "epoch": 1.23,
-      "grad_norm": 4.068699359893799,
-      "learning_rate": 0.0001383033419023136,
-      "loss": 0.421,
       "step": 480
     },
     {
-      "epoch": 1.26,
-      "grad_norm": 4.887763500213623,
-      "learning_rate": 0.00013701799485861185,
-      "loss": 0.5566,
       "step": 490
     },
     {
-      "epoch": 1.29,
-      "grad_norm": 5.322113037109375,
-      "learning_rate": 0.00013573264781491004,
-      "loss": 0.514,
       "step": 500
     },
     {
-      "epoch": 1.31,
-      "grad_norm": 3.480942726135254,
-      "learning_rate": 0.00013444730077120823,
-      "loss": 0.5012,
       "step": 510
     },
     {
-      "epoch": 1.34,
-      "grad_norm": 3.2392122745513916,
-      "learning_rate": 0.00013316195372750644,
-      "loss": 0.5065,
       "step": 520
     },
     {
-      "epoch": 1.36,
-      "grad_norm": 1.8148912191390991,
-      "learning_rate": 0.00013187660668380463,
-      "loss": 0.4932,
       "step": 530
     },
     {
-      "epoch": 1.39,
-      "grad_norm": 1.9780988693237305,
-      "learning_rate": 0.00013059125964010284,
-      "loss": 0.6036,
       "step": 540
     },
     {
-      "epoch": 1.41,
-      "grad_norm": 5.625373840332031,
-      "learning_rate": 0.00012930591259640103,
-      "loss": 0.564,
       "step": 550
     },
     {
-      "epoch": 1.44,
-      "grad_norm": 9.524807929992676,
-      "learning_rate": 0.00012802056555269925,
-      "loss": 0.5695,
       "step": 560
     },
     {
-      "epoch": 1.47,
-      "grad_norm": 1.463976263999939,
-      "learning_rate": 0.00012673521850899743,
-      "loss": 0.3198,
       "step": 570
     },
     {
-      "epoch": 1.49,
-      "grad_norm": 6.108857154846191,
-      "learning_rate": 0.00012544987146529562,
-      "loss": 0.6759,
       "step": 580
     },
     {
-      "epoch": 1.52,
-      "grad_norm": 1.5109316110610962,
-      "learning_rate": 0.00012416452442159384,
-      "loss": 0.4468,
       "step": 590
     },
     {
-      "epoch": 1.54,
-      "grad_norm": 1.9603605270385742,
-      "learning_rate": 0.00012287917737789202,
-      "loss": 0.3569,
       "step": 600
     },
     {
-      "epoch": 1.57,
-      "grad_norm": 7.527422904968262,
-      "learning_rate": 0.00012159383033419023,
-      "loss": 0.6518,
       "step": 610
     },
     {
-      "epoch": 1.59,
-      "grad_norm": 5.3868255615234375,
-      "learning_rate": 0.00012030848329048843,
-      "loss": 0.5278,
       "step": 620
     },
     {
-      "epoch": 1.62,
-      "grad_norm": 8.257445335388184,
-      "learning_rate": 0.00011902313624678665,
-      "loss": 0.6488,
       "step": 630
     },
     {
-      "epoch": 1.65,
-      "grad_norm": 8.786994934082031,
-      "learning_rate": 0.00011773778920308484,
-      "loss": 0.6637,
       "step": 640
     },
     {
-      "epoch": 1.67,
-      "grad_norm": 11.612885475158691,
-      "learning_rate": 0.00011645244215938304,
-      "loss": 0.5637,
       "step": 650
     },
     {
-      "epoch": 1.7,
-      "grad_norm": 4.953100204467773,
-      "learning_rate": 0.00011516709511568124,
-      "loss": 0.3346,
       "step": 660
     },
     {
-      "epoch": 1.72,
-      "grad_norm": 8.756507873535156,
-      "learning_rate": 0.00011388174807197945,
-      "loss": 0.5318,
       "step": 670
     },
     {
-      "epoch": 1.75,
-      "grad_norm": 5.3309760093688965,
-      "learning_rate": 0.00011259640102827765,
-      "loss": 0.433,
       "step": 680
     },
     {
-      "epoch": 1.77,
-      "grad_norm": 0.4981166422367096,
-      "learning_rate": 0.00011131105398457585,
-      "loss": 0.4548,
       "step": 690
     },
     {
-      "epoch": 1.8,
-      "grad_norm": 7.036471366882324,
-      "learning_rate": 0.00011002570694087404,
-      "loss": 0.6301,
       "step": 700
     },
     {
-      "epoch": 1.83,
-      "grad_norm": 5.0402021408081055,
-      "learning_rate": 0.00010874035989717224,
-      "loss": 0.6178,
       "step": 710
     },
     {
-      "epoch": 1.85,
-      "grad_norm": 0.2094542682170868,
-      "learning_rate": 0.00010745501285347044,
-      "loss": 0.3818,
       "step": 720
     },
     {
-      "epoch": 1.88,
-      "grad_norm": 5.399072647094727,
-      "learning_rate": 0.00010616966580976864,
-      "loss": 0.5482,
       "step": 730
     },
     {
-      "epoch": 1.9,
-      "grad_norm": 9.017058372497559,
-      "learning_rate": 0.00010488431876606684,
-      "loss": 0.5286,
       "step": 740
     },
     {
-      "epoch": 1.93,
-      "grad_norm": 2.5559568405151367,
-      "learning_rate": 0.00010359897172236503,
-      "loss": 0.4894,
       "step": 750
     },
     {
-      "epoch": 1.95,
-      "grad_norm": 1.3460350036621094,
-      "learning_rate": 0.00010231362467866323,
-      "loss": 0.369,
       "step": 760
     },
     {
-      "epoch": 1.98,
-      "grad_norm": 0.5879113078117371,
-      "learning_rate": 0.00010102827763496146,
-      "loss": 0.3088,
       "step": 770
     },
     {
-      "epoch": 2.01,
-      "grad_norm": 1.1561224460601807,
-      "learning_rate": 9.974293059125965e-05,
-      "loss": 0.573,
       "step": 780
     },
     {
-      "epoch": 2.03,
-      "grad_norm": 2.361337900161743,
-      "learning_rate": 9.845758354755785e-05,
-      "loss": 0.1348,
       "step": 790
     },
     {
-      "epoch": 2.06,
-      "grad_norm": 2.3323395252227783,
-      "learning_rate": 9.717223650385605e-05,
-      "loss": 0.1292,
       "step": 800
     },
     {
-      "epoch": 2.08,
-      "grad_norm": 0.5499300956726074,
-      "learning_rate": 9.588688946015425e-05,
-      "loss": 0.1817,
       "step": 810
     },
     {
-      "epoch": 2.11,
-      "grad_norm": 0.2054494023323059,
-      "learning_rate": 9.460154241645245e-05,
-      "loss": 0.2232,
       "step": 820
     },
     {
-      "epoch": 2.13,
-      "grad_norm": 2.15979266166687,
-      "learning_rate": 9.331619537275065e-05,
-      "loss": 0.2153,
       "step": 830
     },
     {
-      "epoch": 2.16,
-      "grad_norm": 3.1036410331726074,
-      "learning_rate": 9.203084832904885e-05,
-      "loss": 0.1692,
       "step": 840
     },
     {
-      "epoch": 2.19,
-      "grad_norm": 2.084644317626953,
-      "learning_rate": 9.074550128534704e-05,
-      "loss": 0.2034,
       "step": 850
     },
     {
-      "epoch": 2.21,
-      "grad_norm": 2.1689724922180176,
-      "learning_rate": 8.946015424164524e-05,
-      "loss": 0.2217,
       "step": 860
     },
     {
-      "epoch": 2.24,
-      "grad_norm": 1.0331225395202637,
-      "learning_rate": 8.817480719794346e-05,
-      "loss": 0.1232,
       "step": 870
     },
     {
-      "epoch": 2.26,
-      "grad_norm": 3.129354953765869,
-      "learning_rate": 8.688946015424166e-05,
-      "loss": 0.1363,
       "step": 880
     },
     {
-      "epoch": 2.29,
-      "grad_norm": 0.653751015663147,
-      "learning_rate": 8.560411311053986e-05,
-      "loss": 0.1845,
       "step": 890
     },
     {
-      "epoch": 2.31,
-      "grad_norm": 0.20718339085578918,
-      "learning_rate": 8.431876606683805e-05,
-      "loss": 0.1638,
       "step": 900
     },
     {
-      "epoch": 2.34,
-      "grad_norm": 5.0227274894714355,
-      "learning_rate": 8.303341902313625e-05,
-      "loss": 0.1423,
       "step": 910
     },
     {
-      "epoch": 2.37,
-      "grad_norm": 0.7187924385070801,
-      "learning_rate": 8.174807197943445e-05,
-      "loss": 0.1702,
       "step": 920
     },
     {
-      "epoch": 2.39,
-      "grad_norm": 1.2977266311645508,
-      "learning_rate": 8.046272493573265e-05,
-      "loss": 0.1354,
       "step": 930
     },
     {
-      "epoch": 2.42,
-      "grad_norm": 2.3543667793273926,
-      "learning_rate": 7.917737789203086e-05,
-      "loss": 0.2209,
       "step": 940
     },
     {
-      "epoch": 2.44,
-      "grad_norm": 0.8430781364440918,
-      "learning_rate": 7.789203084832905e-05,
-      "loss": 0.1487,
       "step": 950
     },
     {
-      "epoch": 2.47,
-      "grad_norm": 0.08762349933385849,
-      "learning_rate": 7.660668380462725e-05,
-      "loss": 0.1038,
       "step": 960
     },
     {
-      "epoch": 2.49,
-      "grad_norm": 8.408522605895996,
-      "learning_rate": 7.532133676092545e-05,
-      "loss": 0.2402,
       "step": 970
     },
     {
-      "epoch": 2.52,
-      "grad_norm": 1.173913836479187,
-      "learning_rate": 7.403598971722365e-05,
-      "loss": 0.0641,
       "step": 980
     },
     {
-      "epoch": 2.54,
-      "grad_norm": 7.908231735229492,
-      "learning_rate": 7.275064267352186e-05,
-      "loss": 0.2347,
       "step": 990
     },
     {
-      "epoch": 2.57,
-      "grad_norm": 3.18058180809021,
-      "learning_rate": 7.146529562982006e-05,
-      "loss": 0.1326,
       "step": 1000
     },
     {
-      "epoch": 2.57,
-      "eval_accuracy": 0.9694238815577728,
-      "eval_loss": 0.11155818402767181,
-      "eval_runtime": 52.5851,
-      "eval_samples_per_second": 59.085,
-      "eval_steps_per_second": 7.398,
       "step": 1000
     },
     {
-      "epoch": 2.6,
-      "grad_norm": 0.4821953773498535,
-      "learning_rate": 7.017994858611826e-05,
-      "loss": 0.258,
       "step": 1010
     },
     {
-      "epoch": 2.62,
-      "grad_norm": 4.647073268890381,
-      "learning_rate": 6.889460154241646e-05,
-      "loss": 0.1106,
       "step": 1020
     },
     {
-      "epoch": 2.65,
-      "grad_norm": 0.07687141746282578,
-      "learning_rate": 6.760925449871466e-05,
-      "loss": 0.1768,
       "step": 1030
     },
     {
-      "epoch": 2.67,
-      "grad_norm": 0.8537989854812622,
-      "learning_rate": 6.632390745501286e-05,
-      "loss": 0.1321,
       "step": 1040
     },
     {
-      "epoch": 2.7,
-      "grad_norm": 1.6428909301757812,
-      "learning_rate": 6.503856041131106e-05,
-      "loss": 0.2679,
       "step": 1050
     },
     {
-      "epoch": 2.72,
-      "grad_norm": 0.4707659184932709,
-      "learning_rate": 6.375321336760925e-05,
-      "loss": 0.192,
       "step": 1060
     },
     {
-      "epoch": 2.75,
-      "grad_norm": 0.09739229083061218,
-      "learning_rate": 6.246786632390745e-05,
-      "loss": 0.2501,
       "step": 1070
     },
     {
-      "epoch": 2.78,
-      "grad_norm": 2.0249221324920654,
-      "learning_rate": 6.118251928020567e-05,
-      "loss": 0.1988,
       "step": 1080
     },
     {
-      "epoch": 2.8,
-      "grad_norm": 0.08042796701192856,
-      "learning_rate": 5.989717223650386e-05,
-      "loss": 0.04,
       "step": 1090
     },
     {
-      "epoch": 2.83,
-      "grad_norm": 0.40489840507507324,
-      "learning_rate": 5.861182519280206e-05,
-      "loss": 0.1326,
       "step": 1100
     },
     {
-      "epoch": 2.85,
-      "grad_norm": 8.32421875,
-      "learning_rate": 5.732647814910026e-05,
-      "loss": 0.1881,
       "step": 1110
     },
     {
-      "epoch": 2.88,
-      "grad_norm": 0.3356345295906067,
-      "learning_rate": 5.604113110539846e-05,
-      "loss": 0.1638,
       "step": 1120
     },
     {
-      "epoch": 2.9,
-      "grad_norm": 2.0262017250061035,
-      "learning_rate": 5.475578406169666e-05,
-      "loss": 0.0901,
       "step": 1130
     },
     {
-      "epoch": 2.93,
-      "grad_norm": 5.13381290435791,
-      "learning_rate": 5.347043701799486e-05,
-      "loss": 0.1947,
       "step": 1140
     },
     {
-      "epoch": 2.96,
-      "grad_norm": 4.401228904724121,
-      "learning_rate": 5.218508997429307e-05,
-      "loss": 0.1105,
       "step": 1150
     },
     {
-      "epoch": 2.98,
-      "grad_norm": 3.711754083633423,
-      "learning_rate": 5.089974293059127e-05,
-      "loss": 0.082,
       "step": 1160
     },
     {
-      "epoch": 3.01,
-      "grad_norm": 0.4783603847026825,
-      "learning_rate": 4.961439588688946e-05,
-      "loss": 0.1223,
       "step": 1170
     },
     {
-      "epoch": 3.03,
-      "grad_norm": 6.101786136627197,
-      "learning_rate": 4.8329048843187664e-05,
-      "loss": 0.0386,
       "step": 1180
     },
     {
-      "epoch": 3.06,
-      "grad_norm": 0.09219735115766525,
-      "learning_rate": 4.7043701799485865e-05,
-      "loss": 0.046,
       "step": 1190
     },
     {
-      "epoch": 3.08,
-      "grad_norm": 0.09228511899709702,
-      "learning_rate": 4.5758354755784066e-05,
-      "loss": 0.0179,
       "step": 1200
     },
     {
-      "epoch": 3.11,
-      "grad_norm": 0.06705611199140549,
-      "learning_rate": 4.447300771208227e-05,
-      "loss": 0.0401,
       "step": 1210
     },
     {
-      "epoch": 3.14,
-      "grad_norm": 0.05702489614486694,
-      "learning_rate": 4.318766066838046e-05,
-      "loss": 0.0222,
       "step": 1220
     },
     {
-      "epoch": 3.16,
-      "grad_norm": 0.051934726536273956,
-      "learning_rate": 4.190231362467866e-05,
-      "loss": 0.0249,
       "step": 1230
     },
     {
-      "epoch": 3.19,
-      "grad_norm": 0.05382351949810982,
-      "learning_rate": 4.0616966580976864e-05,
-      "loss": 0.017,
       "step": 1240
     },
-    {
-      "epoch": 3.21,
-      "grad_norm": 0.10244094580411911,
-      "learning_rate": 3.9331619537275065e-05,
-      "loss": 0.1425,
-      "step": 1250
-    },
-    {
-      "epoch": 3.24,
-      "grad_norm": 0.04559057578444481,
-      "learning_rate": 3.8046272493573266e-05,
-      "loss": 0.0188,
-      "step": 1260
-    },
-    {
-      "epoch": 3.26,
-      "grad_norm": 1.9016327857971191,
-      "learning_rate": 3.676092544987147e-05,
-      "loss": 0.0196,
-      "step": 1270
-    },
-    {
-      "epoch": 3.29,
-      "grad_norm": 0.06497751176357269,
-      "learning_rate": 3.547557840616967e-05,
-      "loss": 0.0161,
-      "step": 1280
-    },
-    {
-      "epoch": 3.32,
-      "grad_norm": 0.05229075625538826,
-      "learning_rate": 3.419023136246787e-05,
-      "loss": 0.0165,
-      "step": 1290
-    },
-    {
-      "epoch": 3.34,
-      "grad_norm": 0.04599655419588089,
-      "learning_rate": 3.2904884318766064e-05,
-      "loss": 0.0338,
-      "step": 1300
-    },
-    {
-      "epoch": 3.37,
-      "grad_norm": 0.054148729890584946,
-      "learning_rate": 3.161953727506427e-05,
-      "loss": 0.0401,
-      "step": 1310
-    },
-    {
-      "epoch": 3.39,
-      "grad_norm": 0.135112926363945,
-      "learning_rate": 3.033419023136247e-05,
-      "loss": 0.0386,
-      "step": 1320
-    },
-    {
-      "epoch": 3.42,
-      "grad_norm": 0.05881468951702118,
-      "learning_rate": 2.9048843187660668e-05,
-      "loss": 0.0526,
-      "step": 1330
-    },
-    {
-      "epoch": 3.44,
-      "grad_norm": 0.11401781439781189,
-      "learning_rate": 2.7763496143958872e-05,
-      "loss": 0.0652,
-      "step": 1340
-    },
-    {
-      "epoch": 3.47,
-      "grad_norm": 0.6476575136184692,
-      "learning_rate": 2.647814910025707e-05,
-      "loss": 0.0772,
-      "step": 1350
-    },
-    {
-      "epoch": 3.5,
-      "grad_norm": 0.0521862767636776,
-      "learning_rate": 2.519280205655527e-05,
-      "loss": 0.0166,
-      "step": 1360
-    },
-    {
-      "epoch": 3.52,
-      "grad_norm": 0.05607061833143234,
-      "learning_rate": 2.3907455012853472e-05,
-      "loss": 0.0138,
-      "step": 1370
-    },
-    {
-      "epoch": 3.55,
-      "grad_norm": 0.05825699120759964,
-      "learning_rate": 2.262210796915167e-05,
-      "loss": 0.015,
-      "step": 1380
-    },
-    {
-      "epoch": 3.57,
-      "grad_norm": 4.6812334060668945,
-      "learning_rate": 2.133676092544987e-05,
-      "loss": 0.1053,
-      "step": 1390
-    },
-    {
-      "epoch": 3.6,
-      "grad_norm": 0.6198139786720276,
-      "learning_rate": 2.0051413881748076e-05,
-      "loss": 0.0178,
-      "step": 1400
-    },
-    {
-      "epoch": 3.62,
-      "grad_norm": 0.05886732041835785,
-      "learning_rate": 1.8766066838046273e-05,
-      "loss": 0.0278,
-      "step": 1410
-    },
-    {
-      "epoch": 3.65,
-      "grad_norm": 0.673959493637085,
-      "learning_rate": 1.7480719794344475e-05,
-      "loss": 0.0769,
-      "step": 1420
-    },
-    {
-      "epoch": 3.68,
-      "grad_norm": 0.31164562702178955,
-      "learning_rate": 1.6195372750642672e-05,
-      "loss": 0.1499,
-      "step": 1430
-    },
-    {
-      "epoch": 3.7,
-      "grad_norm": 0.2713916599750519,
-      "learning_rate": 1.4910025706940875e-05,
-      "loss": 0.0444,
-      "step": 1440
-    },
-    {
-      "epoch": 3.73,
-      "grad_norm": 2.0257036685943604,
-      "learning_rate": 1.3624678663239075e-05,
-      "loss": 0.071,
-      "step": 1450
-    },
-    {
-      "epoch": 3.75,
-      "grad_norm": 0.24306029081344604,
-      "learning_rate": 1.2339331619537276e-05,
-      "loss": 0.0114,
-      "step": 1460
-    },
-    {
-      "epoch": 3.78,
-      "grad_norm": 0.2247108817100525,
-      "learning_rate": 1.1053984575835475e-05,
-      "loss": 0.0132,
-      "step": 1470
-    },
-    {
-      "epoch": 3.8,
-      "grad_norm": 0.056268274784088135,
-      "learning_rate": 9.768637532133676e-06,
-      "loss": 0.0827,
-      "step": 1480
-    },
-    {
-      "epoch": 3.83,
-      "grad_norm": 0.04434029012918472,
-      "learning_rate": 8.483290488431877e-06,
-      "loss": 0.0138,
-      "step": 1490
-    },
-    {
-      "epoch": 3.86,
-      "grad_norm": 0.06419169157743454,
-      "learning_rate": 7.197943444730078e-06,
-      "loss": 0.077,
-      "step": 1500
-    },
-    {
-      "epoch": 3.88,
-      "grad_norm": 0.11697979271411896,
-      "learning_rate": 5.912596401028278e-06,
-      "loss": 0.0206,
-      "step": 1510
-    },
-    {
-      "epoch": 3.91,
-      "grad_norm": 1.25772225856781,
-      "learning_rate": 4.627249357326478e-06,
-      "loss": 0.02,
-      "step": 1520
-    },
-    {
-      "epoch": 3.93,
-      "grad_norm": 0.12491010874509811,
-      "learning_rate": 3.3419023136246787e-06,
-      "loss": 0.0257,
-      "step": 1530
-    },
-    {
-      "epoch": 3.96,
-      "grad_norm": 3.4478936195373535,
-      "learning_rate": 2.056555269922879e-06,
-      "loss": 0.0366,
-      "step": 1540
-    },
-    {
-      "epoch": 3.98,
-      "grad_norm": 0.0410100519657135,
-      "learning_rate": 7.712082262210797e-07,
-      "loss": 0.0525,
-      "step": 1550
-    },
     {
       "epoch": 4.0,
-      "step": 1556,
-      "total_flos": 9.63148132192297e+17,
-      "train_loss": 0.48303211886426173,
-      "train_runtime": 1041.8456,
-      "train_samples_per_second": 11.929,
-      "train_steps_per_second": 1.494
     }
   ],
   "logging_steps": 10,
-  "max_steps": 1556,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 4,
   "save_steps": 1000,
-  "total_flos": 9.63148132192297e+17,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 0.21430718898773193,
   "best_model_checkpoint": "./vit-finetune-scrap/checkpoint-1000",
   "epoch": 4.0,
   "eval_steps": 1000,
+  "global_step": 1244,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.03,
+      "grad_norm": 10.849635124206543,
+      "learning_rate": 0.00019839228295819936,
+      "loss": 0.1258,
       "step": 10
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 29.82047462463379,
+      "learning_rate": 0.00019678456591639874,
+      "loss": 0.2394,
       "step": 20
     },
     {
+      "epoch": 0.1,
+      "grad_norm": 4.707005500793457,
+      "learning_rate": 0.00019517684887459809,
+      "loss": 0.234,
       "step": 30
     },
     {
+      "epoch": 0.13,
+      "grad_norm": 13.66462516784668,
+      "learning_rate": 0.00019356913183279743,
+      "loss": 0.705,
       "step": 40
     },
     {
+      "epoch": 0.16,
+      "grad_norm": 4.419419765472412,
+      "learning_rate": 0.00019196141479099678,
+      "loss": 0.6657,
       "step": 50
     },
     {
+      "epoch": 0.19,
+      "grad_norm": 0.14946621656417847,
+      "learning_rate": 0.00019035369774919616,
+      "loss": 0.2407,
       "step": 60
     },
     {
+      "epoch": 0.23,
+      "grad_norm": 2.8290460109710693,
+      "learning_rate": 0.0001887459807073955,
+      "loss": 0.3973,
       "step": 70
     },
     {
+      "epoch": 0.26,
+      "grad_norm": 15.848897933959961,
+      "learning_rate": 0.00018713826366559486,
+      "loss": 0.2432,
       "step": 80
     },
     {
+      "epoch": 0.29,
+      "grad_norm": 0.30860471725463867,
+      "learning_rate": 0.0001855305466237942,
+      "loss": 0.2732,
       "step": 90
     },
     {
+      "epoch": 0.32,
+      "grad_norm": 14.233210563659668,
+      "learning_rate": 0.0001839228295819936,
+      "loss": 0.261,
       "step": 100
     },
     {
+      "epoch": 0.35,
+      "grad_norm": 9.140750885009766,
+      "learning_rate": 0.00018231511254019294,
+      "loss": 0.1776,
       "step": 110
     },
     {
+      "epoch": 0.39,
+      "grad_norm": 0.9528696537017822,
+      "learning_rate": 0.00018070739549839229,
+      "loss": 0.3849,
       "step": 120
     },
     {
+      "epoch": 0.42,
+      "grad_norm": 21.716726303100586,
+      "learning_rate": 0.00017909967845659166,
+      "loss": 0.3328,
       "step": 130
     },
     {
+      "epoch": 0.45,
+      "grad_norm": 7.960571765899658,
+      "learning_rate": 0.000177491961414791,
+      "loss": 0.5052,
       "step": 140
     },
     {
+      "epoch": 0.48,
+      "grad_norm": 3.9136505126953125,
+      "learning_rate": 0.00017588424437299036,
+      "loss": 0.5026,
       "step": 150
     },
     {
+      "epoch": 0.51,
+      "grad_norm": 14.131813049316406,
+      "learning_rate": 0.0001742765273311897,
+      "loss": 0.3997,
       "step": 160
     },
     {
+      "epoch": 0.55,
+      "grad_norm": 13.529720306396484,
+      "learning_rate": 0.0001726688102893891,
+      "loss": 0.2877,
       "step": 170
     },
     {
+      "epoch": 0.58,
+      "grad_norm": 6.182504653930664,
+      "learning_rate": 0.00017106109324758844,
+      "loss": 0.3408,
       "step": 180
     },
     {
+      "epoch": 0.61,
+      "grad_norm": 0.17119653522968292,
+      "learning_rate": 0.0001694533762057878,
+      "loss": 0.2916,
       "step": 190
     },
     {
+      "epoch": 0.64,
+      "grad_norm": 13.307029724121094,
+      "learning_rate": 0.00016784565916398716,
+      "loss": 0.3485,
       "step": 200
     },
     {
+      "epoch": 0.68,
+      "grad_norm": 4.883426666259766,
+      "learning_rate": 0.0001662379421221865,
+      "loss": 0.3939,
       "step": 210
     },
     {
+      "epoch": 0.71,
+      "grad_norm": 5.17271614074707,
+      "learning_rate": 0.00016463022508038586,
+      "loss": 0.4001,
       "step": 220
     },
     {
+      "epoch": 0.74,
+      "grad_norm": 0.18887023627758026,
+      "learning_rate": 0.0001630225080385852,
+      "loss": 0.2459,
       "step": 230
     },
     {
+      "epoch": 0.77,
+      "grad_norm": 0.3397394120693207,
+      "learning_rate": 0.0001614147909967846,
+      "loss": 0.3813,
       "step": 240
     },
     {
+      "epoch": 0.8,
+      "grad_norm": 7.221404075622559,
+      "learning_rate": 0.00015980707395498394,
+      "loss": 0.2913,
       "step": 250
     },
     {
+      "epoch": 0.84,
+      "grad_norm": 3.0032007694244385,
+      "learning_rate": 0.0001581993569131833,
+      "loss": 0.273,
       "step": 260
     },
     {
+      "epoch": 0.87,
+      "grad_norm": 3.486640691757202,
+      "learning_rate": 0.00015659163987138264,
+      "loss": 0.5797,
       "step": 270
     },
     {
+      "epoch": 0.9,
+      "grad_norm": 0.3199945092201233,
+      "learning_rate": 0.00015498392282958201,
+      "loss": 0.4904,
       "step": 280
     },
     {
+      "epoch": 0.93,
+      "grad_norm": 38.1386833190918,
+      "learning_rate": 0.00015337620578778136,
+      "loss": 0.2789,
       "step": 290
     },
     {
+      "epoch": 0.96,
+      "grad_norm": 3.608177661895752,
+      "learning_rate": 0.0001517684887459807,
+      "loss": 0.5587,
       "step": 300
     },
     {
+      "epoch": 1.0,
+      "grad_norm": 0.1488448977470398,
+      "learning_rate": 0.0001501607717041801,
+      "loss": 0.3405,
       "step": 310
     },
     {
+      "epoch": 1.03,
+      "grad_norm": 1.542035460472107,
+      "learning_rate": 0.00014855305466237944,
+      "loss": 0.2688,
       "step": 320
     },
     {
+      "epoch": 1.06,
+      "grad_norm": 2.1089909076690674,
+      "learning_rate": 0.0001469453376205788,
+      "loss": 0.2085,
       "step": 330
     },
     {
+      "epoch": 1.09,
+      "grad_norm": 11.16602897644043,
+      "learning_rate": 0.00014533762057877814,
+      "loss": 0.3437,
       "step": 340
     },
     {
+      "epoch": 1.13,
+      "grad_norm": 2.2596559524536133,
+      "learning_rate": 0.00014372990353697752,
+      "loss": 0.337,
       "step": 350
     },
     {
+      "epoch": 1.16,
+      "grad_norm": 2.616323947906494,
+      "learning_rate": 0.00014212218649517686,
+      "loss": 0.2074,
       "step": 360
     },
     {
+      "epoch": 1.19,
+      "grad_norm": 0.5269195437431335,
+      "learning_rate": 0.00014051446945337621,
+      "loss": 0.0913,
       "step": 370
     },
     {
+      "epoch": 1.22,
+      "grad_norm": 7.34785270690918,
+      "learning_rate": 0.0001389067524115756,
+      "loss": 0.1462,
       "step": 380
     },
     {
+      "epoch": 1.25,
+      "grad_norm": 0.13304546475410461,
+      "learning_rate": 0.00013729903536977494,
+      "loss": 0.0303,
       "step": 390
     },
     {
+      "epoch": 1.29,
+      "grad_norm": 0.42307400703430176,
+      "learning_rate": 0.0001356913183279743,
+      "loss": 0.2195,
       "step": 400
     },
     {
+      "epoch": 1.32,
+      "grad_norm": 0.16662320494651794,
+      "learning_rate": 0.00013408360128617364,
+      "loss": 0.2999,
       "step": 410
     },
     {
+      "epoch": 1.35,
+      "grad_norm": 35.579891204833984,
+      "learning_rate": 0.00013247588424437302,
+      "loss": 0.1194,
       "step": 420
     },
     {
+      "epoch": 1.38,
+      "grad_norm": 3.4818050861358643,
+      "learning_rate": 0.00013086816720257237,
+      "loss": 0.1469,
       "step": 430
     },
     {
+      "epoch": 1.41,
+      "grad_norm": 6.36860466003418,
+      "learning_rate": 0.00012926045016077172,
+      "loss": 0.2234,
       "step": 440
     },
     {
+      "epoch": 1.45,
+      "grad_norm": 7.359828948974609,
+      "learning_rate": 0.00012765273311897106,
+      "loss": 0.2114,
       "step": 450
     },
     {
+      "epoch": 1.48,
+      "grad_norm": 0.11759760975837708,
+      "learning_rate": 0.00012604501607717044,
+      "loss": 0.1059,
       "step": 460
     },
     {
+      "epoch": 1.51,
+      "grad_norm": 0.049188051372766495,
+      "learning_rate": 0.0001244372990353698,
+      "loss": 0.207,
       "step": 470
     },
     {
+      "epoch": 1.54,
+      "grad_norm": 0.06988845020532608,
+      "learning_rate": 0.00012282958199356914,
+      "loss": 0.1319,
       "step": 480
     },
     {
+      "epoch": 1.58,
+      "grad_norm": 10.857504844665527,
+      "learning_rate": 0.0001212218649517685,
+      "loss": 0.3497,
       "step": 490
     },
     {
+      "epoch": 1.61,
+      "grad_norm": 0.04112955555319786,
+      "learning_rate": 0.00011961414790996785,
+      "loss": 0.0711,
       "step": 500
     },
     {
+      "epoch": 1.64,
+      "grad_norm": 20.134990692138672,
+      "learning_rate": 0.0001180064308681672,
+      "loss": 0.2654,
       "step": 510
     },
     {
+      "epoch": 1.67,
+      "grad_norm": 0.03998303785920143,
+      "learning_rate": 0.00011639871382636655,
+      "loss": 0.0911,
       "step": 520
     },
     {
+      "epoch": 1.7,
+      "grad_norm": 10.199617385864258,
+      "learning_rate": 0.00011479099678456593,
+      "loss": 0.1106,
       "step": 530
     },
     {
+      "epoch": 1.74,
+      "grad_norm": 2.3347342014312744,
+      "learning_rate": 0.00011318327974276528,
+      "loss": 0.1948,
       "step": 540
     },
     {
+      "epoch": 1.77,
+      "grad_norm": 15.492130279541016,
+      "learning_rate": 0.00011157556270096463,
+      "loss": 0.2999,
       "step": 550
     },
     {
+      "epoch": 1.8,
+      "grad_norm": 16.2156982421875,
+      "learning_rate": 0.00010996784565916398,
+      "loss": 0.1792,
       "step": 560
     },
     {
+      "epoch": 1.83,
+      "grad_norm": 3.9076225757598877,
+      "learning_rate": 0.00010836012861736335,
+      "loss": 0.4599,
       "step": 570
     },
     {
+      "epoch": 1.86,
+      "grad_norm": 0.0662955567240715,
+      "learning_rate": 0.0001067524115755627,
+      "loss": 0.0834,
       "step": 580
     },
     {
+      "epoch": 1.9,
+      "grad_norm": 0.43734121322631836,
+      "learning_rate": 0.00010514469453376205,
+      "loss": 0.1804,
       "step": 590
     },
     {
+      "epoch": 1.93,
+      "grad_norm": 0.23478691279888153,
+      "learning_rate": 0.00010353697749196143,
+      "loss": 0.0831,
       "step": 600
     },
     {
+      "epoch": 1.96,
+      "grad_norm": 8.97579574584961,
+      "learning_rate": 0.00010192926045016078,
+      "loss": 0.2141,
       "step": 610
     },
     {
+      "epoch": 1.99,
+      "grad_norm": 5.947574615478516,
+      "learning_rate": 0.00010032154340836013,
+      "loss": 0.1059,
       "step": 620
     },
     {
+      "epoch": 2.03,
+      "grad_norm": 0.3693161904811859,
+      "learning_rate": 9.871382636655949e-05,
+      "loss": 0.0478,
       "step": 630
     },
     {
+      "epoch": 2.06,
+      "grad_norm": 0.33773139119148254,
+      "learning_rate": 9.710610932475884e-05,
+      "loss": 0.1512,
       "step": 640
     },
     {
+      "epoch": 2.09,
+      "grad_norm": 0.07303290069103241,
+      "learning_rate": 9.54983922829582e-05,
+      "loss": 0.0746,
       "step": 650
     },
     {
+      "epoch": 2.12,
+      "grad_norm": 0.021892189979553223,
+      "learning_rate": 9.389067524115757e-05,
+      "loss": 0.0071,
       "step": 660
     },
     {
+      "epoch": 2.15,
+      "grad_norm": 0.699686586856842,
+      "learning_rate": 9.228295819935692e-05,
+      "loss": 0.08,
       "step": 670
     },
     {
+      "epoch": 2.19,
+      "grad_norm": 1.7835339307785034,
+      "learning_rate": 9.067524115755628e-05,
+      "loss": 0.092,
       "step": 680
     },
     {
+      "epoch": 2.22,
+      "grad_norm": 0.025796858593821526,
+      "learning_rate": 8.906752411575563e-05,
+      "loss": 0.041,
       "step": 690
     },
     {
+      "epoch": 2.25,
+      "grad_norm": 11.788249969482422,
+      "learning_rate": 8.7459807073955e-05,
+      "loss": 0.0269,
       "step": 700
     },
     {
+      "epoch": 2.28,
+      "grad_norm": 6.836824893951416,
+      "learning_rate": 8.585209003215434e-05,
+      "loss": 0.2086,
       "step": 710
     },
     {
+      "epoch": 2.32,
+      "grad_norm": 0.02837471477687359,
+      "learning_rate": 8.42443729903537e-05,
+      "loss": 0.0828,
       "step": 720
     },
     {
+      "epoch": 2.35,
+      "grad_norm": 0.04012266919016838,
+      "learning_rate": 8.263665594855306e-05,
+      "loss": 0.0057,
       "step": 730
     },
     {
+      "epoch": 2.38,
+      "grad_norm": 0.05077001079916954,
+      "learning_rate": 8.102893890675242e-05,
+      "loss": 0.0116,
       "step": 740
     },
     {
+      "epoch": 2.41,
+      "grad_norm": 0.08000744879245758,
+      "learning_rate": 7.942122186495177e-05,
+      "loss": 0.0305,
       "step": 750
     },
     {
+      "epoch": 2.44,
+      "grad_norm": 0.03205496072769165,
+      "learning_rate": 7.781350482315113e-05,
+      "loss": 0.0451,
       "step": 760
     },
     {
+      "epoch": 2.48,
+      "grad_norm": 0.027969710528850555,
+      "learning_rate": 7.62057877813505e-05,
+      "loss": 0.0779,
       "step": 770
     },
     {
+      "epoch": 2.51,
+      "grad_norm": 3.297053098678589,
+      "learning_rate": 7.459807073954984e-05,
+      "loss": 0.028,
       "step": 780
     },
     {
+      "epoch": 2.54,
+      "grad_norm": 1.8469219207763672,
+      "learning_rate": 7.299035369774921e-05,
+      "loss": 0.1222,
       "step": 790
     },
     {
+      "epoch": 2.57,
+      "grad_norm": 0.5228595733642578,
+      "learning_rate": 7.138263665594856e-05,
+      "loss": 0.0087,
       "step": 800
     },
     {
+      "epoch": 2.6,
+      "grad_norm": 0.028694279491901398,
+      "learning_rate": 6.977491961414792e-05,
+      "loss": 0.0053,
       "step": 810
     },
     {
+      "epoch": 2.64,
+      "grad_norm": 0.026992863044142723,
+      "learning_rate": 6.816720257234727e-05,
+      "loss": 0.0065,
       "step": 820
     },
     {
+      "epoch": 2.67,
+      "grad_norm": 1.996466040611267,
+      "learning_rate": 6.655948553054663e-05,
+      "loss": 0.0955,
       "step": 830
     },
     {
+      "epoch": 2.7,
+      "grad_norm": 0.01983807235956192,
+      "learning_rate": 6.495176848874598e-05,
+      "loss": 0.1021,
       "step": 840
     },
     {
+      "epoch": 2.73,
+      "grad_norm": 0.03182640299201012,
+      "learning_rate": 6.334405144694535e-05,
+      "loss": 0.1796,
       "step": 850
     },
     {
+      "epoch": 2.77,
+      "grad_norm": 0.049088891595602036,
+      "learning_rate": 6.173633440514471e-05,
+      "loss": 0.0907,
       "step": 860
     },
     {
+      "epoch": 2.8,
+      "grad_norm": 0.11043746769428253,
+      "learning_rate": 6.012861736334405e-05,
+      "loss": 0.0627,
       "step": 870
     },
     {
+      "epoch": 2.83,
+      "grad_norm": 0.2206079512834549,
+      "learning_rate": 5.8520900321543414e-05,
+      "loss": 0.0412,
       "step": 880
     },
     {
+      "epoch": 2.86,
+      "grad_norm": 0.02966146729886532,
+      "learning_rate": 5.6913183279742764e-05,
+      "loss": 0.1015,
       "step": 890
     },
     {
+      "epoch": 2.89,
+      "grad_norm": 0.0345352403819561,
+      "learning_rate": 5.530546623794213e-05,
+      "loss": 0.0629,
       "step": 900
     },
     {
+      "epoch": 2.93,
+      "grad_norm": 0.06348275393247604,
+      "learning_rate": 5.369774919614148e-05,
+      "loss": 0.0064,
       "step": 910
     },
     {
+      "epoch": 2.96,
+      "grad_norm": 0.06559421122074127,
+      "learning_rate": 5.209003215434084e-05,
+      "loss": 0.0191,
       "step": 920
     },
     {
+      "epoch": 2.99,
+      "grad_norm": 1.113765835762024,
+      "learning_rate": 5.048231511254019e-05,
+      "loss": 0.0259,
       "step": 930
     },
     {
+      "epoch": 3.02,
+      "grad_norm": 0.02486424334347248,
+      "learning_rate": 4.887459807073955e-05,
+      "loss": 0.0049,
       "step": 940
     },
     {
+      "epoch": 3.05,
+      "grad_norm": 0.7845320701599121,
+      "learning_rate": 4.726688102893891e-05,
+      "loss": 0.0043,
       "step": 950
     },
     {
+      "epoch": 3.09,
+      "grad_norm": 0.021990863606333733,
+      "learning_rate": 4.5659163987138265e-05,
+      "loss": 0.0042,
       "step": 960
     },
     {
+      "epoch": 3.12,
+      "grad_norm": 0.022443994879722595,
+      "learning_rate": 4.405144694533762e-05,
+      "loss": 0.0054,
       "step": 970
     },
     {
+      "epoch": 3.15,
+      "grad_norm": 0.009742784313857555,
+      "learning_rate": 4.244372990353698e-05,
+      "loss": 0.0041,
       "step": 980
     },
     {
+      "epoch": 3.18,
+      "grad_norm": 0.037747763097286224,
+      "learning_rate": 4.083601286173634e-05,
+      "loss": 0.0242,
       "step": 990
     },
     {
+      "epoch": 3.22,
+      "grad_norm": 0.010466611944139004,
+      "learning_rate": 3.92282958199357e-05,
+      "loss": 0.0035,
       "step": 1000
     },
     {
+      "epoch": 3.22,
+      "eval_accuracy": 0.9485530546623794,
+      "eval_loss": 0.21430718898773193,
+      "eval_runtime": 14.2198,
+      "eval_samples_per_second": 43.742,
+      "eval_steps_per_second": 5.485,
       "step": 1000
     },
     {
+      "epoch": 3.25,
+      "grad_norm": 0.00991890113800764,
+      "learning_rate": 3.7620578778135054e-05,
+      "loss": 0.0039,
       "step": 1010
     },
     {
+      "epoch": 3.28,
+      "grad_norm": 0.016740955412387848,
+      "learning_rate": 3.601286173633441e-05,
+      "loss": 0.0065,
       "step": 1020
     },
     {
+      "epoch": 3.31,
+      "grad_norm": 0.03466745838522911,
+      "learning_rate": 3.4405144694533766e-05,
+      "loss": 0.099,
       "step": 1030
     },
     {
+      "epoch": 3.34,
+      "grad_norm": 0.008615425787866116,
+      "learning_rate": 3.279742765273312e-05,
+      "loss": 0.0179,
       "step": 1040
     },
     {
+      "epoch": 3.38,
+      "grad_norm": 0.05827281251549721,
+      "learning_rate": 3.118971061093248e-05,
+      "loss": 0.0041,
       "step": 1050
     },
     {
+      "epoch": 3.41,
+      "grad_norm": 0.0276072658598423,
+      "learning_rate": 2.9581993569131832e-05,
+      "loss": 0.0036,
       "step": 1060
     },
     {
+      "epoch": 3.44,
+      "grad_norm": 0.011412302032113075,
+      "learning_rate": 2.7974276527331188e-05,
+      "loss": 0.1013,
       "step": 1070
     },
     {
+      "epoch": 3.47,
+      "grad_norm": 0.013982011005282402,
+      "learning_rate": 2.6366559485530545e-05,
+      "loss": 0.0058,
       "step": 1080
     },
     {
+      "epoch": 3.5,
+      "grad_norm": 0.026057597249746323,
+      "learning_rate": 2.4758842443729904e-05,
+      "loss": 0.0077,
       "step": 1090
     },
     {
+      "epoch": 3.54,
+      "grad_norm": 0.04853319376707077,
+      "learning_rate": 2.315112540192926e-05,
+      "loss": 0.0035,
       "step": 1100
     },
     {
+      "epoch": 3.57,
+      "grad_norm": 0.013841088861227036,
+      "learning_rate": 2.154340836012862e-05,
+      "loss": 0.0208,
       "step": 1110
     },
     {
+      "epoch": 3.6,
+      "grad_norm": 0.03845496475696564,
+      "learning_rate": 1.9935691318327977e-05,
+      "loss": 0.0038,
       "step": 1120
     },
     {
+      "epoch": 3.63,
+      "grad_norm": 0.023922910913825035,
+      "learning_rate": 1.8327974276527333e-05,
+      "loss": 0.0032,
       "step": 1130
     },
     {
+      "epoch": 3.67,
+      "grad_norm": 0.014864934608340263,
+      "learning_rate": 1.672025723472669e-05,
+      "loss": 0.0028,
       "step": 1140
     },
     {
+      "epoch": 3.7,
+      "grad_norm": 0.05655550956726074,
+      "learning_rate": 1.5112540192926044e-05,
+      "loss": 0.0039,
       "step": 1150
     },
     {
+      "epoch": 3.73,
+      "grad_norm": 0.012573642656207085,
+      "learning_rate": 1.3504823151125404e-05,
+      "loss": 0.0028,
       "step": 1160
     },
     {
+      "epoch": 3.76,
+      "grad_norm": 0.022632773965597153,
+      "learning_rate": 1.189710610932476e-05,
+      "loss": 0.0033,
       "step": 1170
     },
     {
+      "epoch": 3.79,
+      "grad_norm": 0.01279931515455246,
+      "learning_rate": 1.0289389067524116e-05,
+      "loss": 0.0238,
       "step": 1180
     },
     {
+      "epoch": 3.83,
+      "grad_norm": 0.023662865161895752,
+      "learning_rate": 8.681672025723474e-06,
+      "loss": 0.0253,
       "step": 1190
     },
     {
+      "epoch": 3.86,
+      "grad_norm": 0.017510054633021355,
+      "learning_rate": 7.07395498392283e-06,
+      "loss": 0.0047,
       "step": 1200
     },
     {
+      "epoch": 3.89,
+      "grad_norm": 0.0257584135979414,
+      "learning_rate": 5.466237942122187e-06,
+      "loss": 0.004,
       "step": 1210
     },
     {
+      "epoch": 3.92,
+      "grad_norm": 0.3079407513141632,
+      "learning_rate": 3.858520900321544e-06,
+      "loss": 0.085,
       "step": 1220
     },
     {
+      "epoch": 3.95,
+      "grad_norm": 0.00990583747625351,
+      "learning_rate": 2.2508038585209006e-06,
+      "loss": 0.0029,
       "step": 1230
     },
     {
+      "epoch": 3.99,
+      "grad_norm": 0.011738813482224941,
+      "learning_rate": 6.430868167202573e-07,
+      "loss": 0.034,
       "step": 1240
     },
     {
       "epoch": 4.0,
+      "step": 1244,
+      "total_flos": 7.703325099767808e+17,
+      "train_loss": 0.1579143282675882,
+      "train_runtime": 491.3754,
+      "train_samples_per_second": 20.229,
+      "train_steps_per_second": 2.532
     }
   ],
   "logging_steps": 10,
+  "max_steps": 1244,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 4,
   "save_steps": 1000,
+  "total_flos": 7.703325099767808e+17,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null