🍻 cheers

Browse files

Files changed (6) hide show

README.md +6 -5
all_results.json +13 -0
eval_results.json +8 -0
runs/Oct07_16-01-53_efd0d9aa04b4/events.out.tfevents.1728319587.efd0d9aa04b4.3229.1 +3 -0
train_results.json +8 -0
trainer_state.json +1743 -0

README.md CHANGED Viewed

@@ -3,6 +3,7 @@ library_name: transformers
 license: apache-2.0
 base_model: google/vit-base-patch16-224-in21k
 tags:
 - generated_from_trainer
 datasets:
 - imagefolder
@@ -15,7 +16,7 @@ model-index:
       name: Image Classification
       type: image-classification
     dataset:
-      name: imagefolder
       type: imagefolder
       config: default
       split: train
@@ -23,7 +24,7 @@ model-index:
     metrics:
     - name: Accuracy
       type: accuracy
-      value: 0.9986902423051736
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -31,10 +32,10 @@ should probably proofread and complete it, then remove this comment. -->
 # finetuned-arsenic
-This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.0066
-- Accuracy: 0.9987
 ## Model description

 license: apache-2.0
 base_model: google/vit-base-patch16-224-in21k
 tags:
+- image-classification
 - generated_from_trainer
 datasets:
 - imagefolder
       name: Image Classification
       type: image-classification
     dataset:
+      name: indian_food_images
       type: imagefolder
       config: default
       split: train
     metrics:
     - name: Accuracy
       type: accuracy
+      value: 0.9993451211525868
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 # finetuned-arsenic
+This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the indian_food_images dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.0048
+- Accuracy: 0.9993
 ## Model description

all_results.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+    "epoch": 4.0,
+    "eval_accuracy": 0.9993451211525868,
+    "eval_loss": 0.0047513521276414394,
+    "eval_runtime": 53.3656,
+    "eval_samples_per_second": 28.614,
+    "eval_steps_per_second": 3.579,
+    "total_flos": 2.6818427765818e+18,
+    "train_loss": 0.0841421499820822,
+    "train_runtime": 2597.595,
+    "train_samples_per_second": 13.323,
+    "train_steps_per_second": 0.833
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 4.0,
+    "eval_accuracy": 0.9993451211525868,
+    "eval_loss": 0.0047513521276414394,
+    "eval_runtime": 53.3656,
+    "eval_samples_per_second": 28.614,
+    "eval_steps_per_second": 3.579
+}

runs/Oct07_16-01-53_efd0d9aa04b4/events.out.tfevents.1728319587.efd0d9aa04b4.3229.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:981e50a825a3773a3d80bd9d1ea2d9197665fff9d6605b4505da276baf04d7d6
+size 411

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 4.0,
+    "total_flos": 2.6818427765818e+18,
+    "train_loss": 0.0841421499820822,
+    "train_runtime": 2597.595,
+    "train_samples_per_second": 13.323,
+    "train_steps_per_second": 0.833
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1743 @@

+{
+  "best_metric": 0.0047513521276414394,
+  "best_model_checkpoint": "finetuned-arsenic/checkpoint-2000",
+  "epoch": 4.0,
+  "eval_steps": 100,
+  "global_step": 2164,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.018484288354898338,
+      "grad_norm": 4.949392795562744,
+      "learning_rate": 0.0001990757855822551,
+      "loss": 0.5368,
+      "step": 10
+    },
+    {
+      "epoch": 0.036968576709796676,
+      "grad_norm": 3.3969953060150146,
+      "learning_rate": 0.00019815157116451017,
+      "loss": 0.3313,
+      "step": 20
+    },
+    {
+      "epoch": 0.05545286506469501,
+      "grad_norm": 0.859575629234314,
+      "learning_rate": 0.00019722735674676528,
+      "loss": 0.5003,
+      "step": 30
+    },
+    {
+      "epoch": 0.07393715341959335,
+      "grad_norm": 5.522923946380615,
+      "learning_rate": 0.00019630314232902034,
+      "loss": 0.2564,
+      "step": 40
+    },
+    {
+      "epoch": 0.09242144177449169,
+      "grad_norm": 4.462332248687744,
+      "learning_rate": 0.00019537892791127544,
+      "loss": 0.3339,
+      "step": 50
+    },
+    {
+      "epoch": 0.11090573012939002,
+      "grad_norm": 1.6224160194396973,
+      "learning_rate": 0.0001944547134935305,
+      "loss": 0.3965,
+      "step": 60
+    },
+    {
+      "epoch": 0.12939001848428835,
+      "grad_norm": 6.097796440124512,
+      "learning_rate": 0.0001935304990757856,
+      "loss": 0.3319,
+      "step": 70
+    },
+    {
+      "epoch": 0.1478743068391867,
+      "grad_norm": 3.9769697189331055,
+      "learning_rate": 0.00019260628465804066,
+      "loss": 0.4012,
+      "step": 80
+    },
+    {
+      "epoch": 0.16635859519408502,
+      "grad_norm": 2.335510730743408,
+      "learning_rate": 0.00019168207024029577,
+      "loss": 0.4584,
+      "step": 90
+    },
+    {
+      "epoch": 0.18484288354898337,
+      "grad_norm": 3.8701980113983154,
+      "learning_rate": 0.00019075785582255082,
+      "loss": 0.1855,
+      "step": 100
+    },
+    {
+      "epoch": 0.18484288354898337,
+      "eval_accuracy": 0.931237721021611,
+      "eval_loss": 0.1917603313922882,
+      "eval_runtime": 57.9367,
+      "eval_samples_per_second": 26.356,
+      "eval_steps_per_second": 3.297,
+      "step": 100
+    },
+    {
+      "epoch": 0.2033271719038817,
+      "grad_norm": 2.2155282497406006,
+      "learning_rate": 0.00018983364140480593,
+      "loss": 0.2331,
+      "step": 110
+    },
+    {
+      "epoch": 0.22181146025878004,
+      "grad_norm": 0.9634373188018799,
+      "learning_rate": 0.000188909426987061,
+      "loss": 0.209,
+      "step": 120
+    },
+    {
+      "epoch": 0.24029574861367836,
+      "grad_norm": 0.2715567648410797,
+      "learning_rate": 0.0001879852125693161,
+      "loss": 0.1486,
+      "step": 130
+    },
+    {
+      "epoch": 0.2587800369685767,
+      "grad_norm": 12.090089797973633,
+      "learning_rate": 0.00018706099815157118,
+      "loss": 0.1629,
+      "step": 140
+    },
+    {
+      "epoch": 0.27726432532347506,
+      "grad_norm": 1.551562786102295,
+      "learning_rate": 0.00018613678373382626,
+      "loss": 0.1852,
+      "step": 150
+    },
+    {
+      "epoch": 0.2957486136783734,
+      "grad_norm": 0.775977373123169,
+      "learning_rate": 0.00018521256931608134,
+      "loss": 0.3179,
+      "step": 160
+    },
+    {
+      "epoch": 0.3142329020332717,
+      "grad_norm": 3.0043396949768066,
+      "learning_rate": 0.00018428835489833642,
+      "loss": 0.3842,
+      "step": 170
+    },
+    {
+      "epoch": 0.33271719038817005,
+      "grad_norm": 1.2949095964431763,
+      "learning_rate": 0.0001833641404805915,
+      "loss": 0.2534,
+      "step": 180
+    },
+    {
+      "epoch": 0.3512014787430684,
+      "grad_norm": 9.545828819274902,
+      "learning_rate": 0.00018243992606284658,
+      "loss": 0.2031,
+      "step": 190
+    },
+    {
+      "epoch": 0.36968576709796674,
+      "grad_norm": 0.29387930035591125,
+      "learning_rate": 0.0001815157116451017,
+      "loss": 0.1792,
+      "step": 200
+    },
+    {
+      "epoch": 0.36968576709796674,
+      "eval_accuracy": 0.9364767518009168,
+      "eval_loss": 0.17399875819683075,
+      "eval_runtime": 52.9831,
+      "eval_samples_per_second": 28.821,
+      "eval_steps_per_second": 3.605,
+      "step": 200
+    },
+    {
+      "epoch": 0.38817005545286504,
+      "grad_norm": 2.138578414916992,
+      "learning_rate": 0.00018059149722735675,
+      "loss": 0.2129,
+      "step": 210
+    },
+    {
+      "epoch": 0.4066543438077634,
+      "grad_norm": 2.022083282470703,
+      "learning_rate": 0.00017966728280961186,
+      "loss": 0.1577,
+      "step": 220
+    },
+    {
+      "epoch": 0.42513863216266173,
+      "grad_norm": 2.8811872005462646,
+      "learning_rate": 0.0001787430683918669,
+      "loss": 0.21,
+      "step": 230
+    },
+    {
+      "epoch": 0.4436229205175601,
+      "grad_norm": 1.491790771484375,
+      "learning_rate": 0.00017781885397412202,
+      "loss": 0.2498,
+      "step": 240
+    },
+    {
+      "epoch": 0.46210720887245843,
+      "grad_norm": 2.5274643898010254,
+      "learning_rate": 0.00017689463955637707,
+      "loss": 0.149,
+      "step": 250
+    },
+    {
+      "epoch": 0.4805914972273567,
+      "grad_norm": 0.6268563270568848,
+      "learning_rate": 0.00017597042513863218,
+      "loss": 0.1306,
+      "step": 260
+    },
+    {
+      "epoch": 0.49907578558225507,
+      "grad_norm": 6.4418511390686035,
+      "learning_rate": 0.00017504621072088724,
+      "loss": 0.1889,
+      "step": 270
+    },
+    {
+      "epoch": 0.5175600739371534,
+      "grad_norm": 0.13176225125789642,
+      "learning_rate": 0.00017412199630314234,
+      "loss": 0.1304,
+      "step": 280
+    },
+    {
+      "epoch": 0.5360443622920518,
+      "grad_norm": 1.4023276567459106,
+      "learning_rate": 0.00017319778188539743,
+      "loss": 0.0872,
+      "step": 290
+    },
+    {
+      "epoch": 0.5545286506469501,
+      "grad_norm": 5.165181636810303,
+      "learning_rate": 0.0001722735674676525,
+      "loss": 0.1688,
+      "step": 300
+    },
+    {
+      "epoch": 0.5545286506469501,
+      "eval_accuracy": 0.9692206941715783,
+      "eval_loss": 0.078226737678051,
+      "eval_runtime": 52.9719,
+      "eval_samples_per_second": 28.827,
+      "eval_steps_per_second": 3.606,
+      "step": 300
+    },
+    {
+      "epoch": 0.5730129390018485,
+      "grad_norm": 4.743193626403809,
+      "learning_rate": 0.0001713493530499076,
+      "loss": 0.1222,
+      "step": 310
+    },
+    {
+      "epoch": 0.5914972273567468,
+      "grad_norm": 3.3770973682403564,
+      "learning_rate": 0.00017042513863216267,
+      "loss": 0.2799,
+      "step": 320
+    },
+    {
+      "epoch": 0.609981515711645,
+      "grad_norm": 1.9085370302200317,
+      "learning_rate": 0.00016950092421441775,
+      "loss": 0.1779,
+      "step": 330
+    },
+    {
+      "epoch": 0.6284658040665434,
+      "grad_norm": 2.592458963394165,
+      "learning_rate": 0.00016857670979667283,
+      "loss": 0.1619,
+      "step": 340
+    },
+    {
+      "epoch": 0.6469500924214417,
+      "grad_norm": 1.1735055446624756,
+      "learning_rate": 0.00016765249537892791,
+      "loss": 0.4249,
+      "step": 350
+    },
+    {
+      "epoch": 0.6654343807763401,
+      "grad_norm": 3.8289904594421387,
+      "learning_rate": 0.000166728280961183,
+      "loss": 0.1009,
+      "step": 360
+    },
+    {
+      "epoch": 0.6839186691312384,
+      "grad_norm": 2.531283378601074,
+      "learning_rate": 0.00016580406654343808,
+      "loss": 0.1494,
+      "step": 370
+    },
+    {
+      "epoch": 0.7024029574861368,
+      "grad_norm": 0.21572425961494446,
+      "learning_rate": 0.00016487985212569316,
+      "loss": 0.0824,
+      "step": 380
+    },
+    {
+      "epoch": 0.7208872458410351,
+      "grad_norm": 3.6041758060455322,
+      "learning_rate": 0.00016395563770794827,
+      "loss": 0.1145,
+      "step": 390
+    },
+    {
+      "epoch": 0.7393715341959335,
+      "grad_norm": 0.6018674969673157,
+      "learning_rate": 0.00016303142329020332,
+      "loss": 0.1238,
+      "step": 400
+    },
+    {
+      "epoch": 0.7393715341959335,
+      "eval_accuracy": 0.922724296005239,
+      "eval_loss": 0.21575002372264862,
+      "eval_runtime": 52.6224,
+      "eval_samples_per_second": 29.018,
+      "eval_steps_per_second": 3.63,
+      "step": 400
+    },
+    {
+      "epoch": 0.7578558225508318,
+      "grad_norm": 0.25093191862106323,
+      "learning_rate": 0.00016210720887245843,
+      "loss": 0.0724,
+      "step": 410
+    },
+    {
+      "epoch": 0.7763401109057301,
+      "grad_norm": 0.2480381280183792,
+      "learning_rate": 0.00016118299445471348,
+      "loss": 0.106,
+      "step": 420
+    },
+    {
+      "epoch": 0.7948243992606284,
+      "grad_norm": 8.212138175964355,
+      "learning_rate": 0.0001602587800369686,
+      "loss": 0.1665,
+      "step": 430
+    },
+    {
+      "epoch": 0.8133086876155268,
+      "grad_norm": 0.6615661382675171,
+      "learning_rate": 0.00015933456561922367,
+      "loss": 0.0547,
+      "step": 440
+    },
+    {
+      "epoch": 0.8317929759704251,
+      "grad_norm": 4.98212194442749,
+      "learning_rate": 0.00015841035120147876,
+      "loss": 0.1982,
+      "step": 450
+    },
+    {
+      "epoch": 0.8502772643253235,
+      "grad_norm": 1.7662006616592407,
+      "learning_rate": 0.00015748613678373384,
+      "loss": 0.1402,
+      "step": 460
+    },
+    {
+      "epoch": 0.8687615526802218,
+      "grad_norm": 5.664543151855469,
+      "learning_rate": 0.00015656192236598892,
+      "loss": 0.1606,
+      "step": 470
+    },
+    {
+      "epoch": 0.8872458410351202,
+      "grad_norm": 5.662344932556152,
+      "learning_rate": 0.000155637707948244,
+      "loss": 0.0869,
+      "step": 480
+    },
+    {
+      "epoch": 0.9057301293900185,
+      "grad_norm": 1.1777679920196533,
+      "learning_rate": 0.00015471349353049908,
+      "loss": 0.0827,
+      "step": 490
+    },
+    {
+      "epoch": 0.9242144177449169,
+      "grad_norm": 0.06051797419786453,
+      "learning_rate": 0.00015378927911275416,
+      "loss": 0.0969,
+      "step": 500
+    },
+    {
+      "epoch": 0.9242144177449169,
+      "eval_accuracy": 0.9842829076620825,
+      "eval_loss": 0.04485374689102173,
+      "eval_runtime": 52.5355,
+      "eval_samples_per_second": 29.066,
+      "eval_steps_per_second": 3.636,
+      "step": 500
+    },
+    {
+      "epoch": 0.9426987060998152,
+      "grad_norm": 9.434717178344727,
+      "learning_rate": 0.00015286506469500925,
+      "loss": 0.1921,
+      "step": 510
+    },
+    {
+      "epoch": 0.9611829944547134,
+      "grad_norm": 1.619040846824646,
+      "learning_rate": 0.00015194085027726433,
+      "loss": 0.1906,
+      "step": 520
+    },
+    {
+      "epoch": 0.9796672828096118,
+      "grad_norm": 0.5532277226448059,
+      "learning_rate": 0.0001510166358595194,
+      "loss": 0.1082,
+      "step": 530
+    },
+    {
+      "epoch": 0.9981515711645101,
+      "grad_norm": 0.0866900086402893,
+      "learning_rate": 0.0001500924214417745,
+      "loss": 0.1119,
+      "step": 540
+    },
+    {
+      "epoch": 1.0166358595194085,
+      "grad_norm": 2.668076276779175,
+      "learning_rate": 0.00014916820702402957,
+      "loss": 0.143,
+      "step": 550
+    },
+    {
+      "epoch": 1.0351201478743068,
+      "grad_norm": 0.15896956622600555,
+      "learning_rate": 0.00014824399260628468,
+      "loss": 0.0378,
+      "step": 560
+    },
+    {
+      "epoch": 1.0536044362292052,
+      "grad_norm": 0.12053361535072327,
+      "learning_rate": 0.00014731977818853976,
+      "loss": 0.0528,
+      "step": 570
+    },
+    {
+      "epoch": 1.0720887245841035,
+      "grad_norm": 0.06896385550498962,
+      "learning_rate": 0.00014639556377079484,
+      "loss": 0.1663,
+      "step": 580
+    },
+    {
+      "epoch": 1.0905730129390019,
+      "grad_norm": 7.400400638580322,
+      "learning_rate": 0.00014547134935304992,
+      "loss": 0.081,
+      "step": 590
+    },
+    {
+      "epoch": 1.1090573012939002,
+      "grad_norm": 0.04029673710465431,
+      "learning_rate": 0.000144547134935305,
+      "loss": 0.0326,
+      "step": 600
+    },
+    {
+      "epoch": 1.1090573012939002,
+      "eval_accuracy": 0.9574328749181401,
+      "eval_loss": 0.1554253250360489,
+      "eval_runtime": 52.4665,
+      "eval_samples_per_second": 29.104,
+      "eval_steps_per_second": 3.64,
+      "step": 600
+    },
+    {
+      "epoch": 1.1275415896487986,
+      "grad_norm": 1.2735309600830078,
+      "learning_rate": 0.0001436229205175601,
+      "loss": 0.1339,
+      "step": 610
+    },
+    {
+      "epoch": 1.146025878003697,
+      "grad_norm": 2.2266452312469482,
+      "learning_rate": 0.00014269870609981517,
+      "loss": 0.1443,
+      "step": 620
+    },
+    {
+      "epoch": 1.1645101663585953,
+      "grad_norm": 2.932450294494629,
+      "learning_rate": 0.00014177449168207025,
+      "loss": 0.0869,
+      "step": 630
+    },
+    {
+      "epoch": 1.1829944547134936,
+      "grad_norm": 5.688024520874023,
+      "learning_rate": 0.00014085027726432533,
+      "loss": 0.091,
+      "step": 640
+    },
+    {
+      "epoch": 1.201478743068392,
+      "grad_norm": 0.04643339663743973,
+      "learning_rate": 0.0001399260628465804,
+      "loss": 0.0433,
+      "step": 650
+    },
+    {
+      "epoch": 1.21996303142329,
+      "grad_norm": 0.38614460825920105,
+      "learning_rate": 0.0001390018484288355,
+      "loss": 0.0514,
+      "step": 660
+    },
+    {
+      "epoch": 1.2384473197781884,
+      "grad_norm": 0.03372357785701752,
+      "learning_rate": 0.00013807763401109058,
+      "loss": 0.0826,
+      "step": 670
+    },
+    {
+      "epoch": 1.2569316081330868,
+      "grad_norm": 0.7059990763664246,
+      "learning_rate": 0.00013715341959334566,
+      "loss": 0.1309,
+      "step": 680
+    },
+    {
+      "epoch": 1.2754158964879851,
+      "grad_norm": 1.5385607481002808,
+      "learning_rate": 0.00013622920517560074,
+      "loss": 0.115,
+      "step": 690
+    },
+    {
+      "epoch": 1.2939001848428835,
+      "grad_norm": 1.647644281387329,
+      "learning_rate": 0.00013530499075785582,
+      "loss": 0.1057,
+      "step": 700
+    },
+    {
+      "epoch": 1.2939001848428835,
+      "eval_accuracy": 0.9738048461034708,
+      "eval_loss": 0.08448445796966553,
+      "eval_runtime": 52.7705,
+      "eval_samples_per_second": 28.937,
+      "eval_steps_per_second": 3.619,
+      "step": 700
+    },
+    {
+      "epoch": 1.3123844731977818,
+      "grad_norm": 0.8896564841270447,
+      "learning_rate": 0.0001343807763401109,
+      "loss": 0.1076,
+      "step": 710
+    },
+    {
+      "epoch": 1.3308687615526802,
+      "grad_norm": 0.9722292423248291,
+      "learning_rate": 0.000133456561922366,
+      "loss": 0.1285,
+      "step": 720
+    },
+    {
+      "epoch": 1.3493530499075785,
+      "grad_norm": 3.9030041694641113,
+      "learning_rate": 0.00013253234750462106,
+      "loss": 0.1367,
+      "step": 730
+    },
+    {
+      "epoch": 1.3678373382624769,
+      "grad_norm": 1.199768304824829,
+      "learning_rate": 0.00013160813308687617,
+      "loss": 0.088,
+      "step": 740
+    },
+    {
+      "epoch": 1.3863216266173752,
+      "grad_norm": 0.8339413404464722,
+      "learning_rate": 0.00013068391866913125,
+      "loss": 0.0481,
+      "step": 750
+    },
+    {
+      "epoch": 1.4048059149722736,
+      "grad_norm": 2.3673453330993652,
+      "learning_rate": 0.00012975970425138634,
+      "loss": 0.0698,
+      "step": 760
+    },
+    {
+      "epoch": 1.423290203327172,
+      "grad_norm": 0.042785417288541794,
+      "learning_rate": 0.00012883548983364142,
+      "loss": 0.0179,
+      "step": 770
+    },
+    {
+      "epoch": 1.4417744916820703,
+      "grad_norm": 2.720048189163208,
+      "learning_rate": 0.0001279112754158965,
+      "loss": 0.0996,
+      "step": 780
+    },
+    {
+      "epoch": 1.4602587800369686,
+      "grad_norm": 16.840740203857422,
+      "learning_rate": 0.00012698706099815158,
+      "loss": 0.0707,
+      "step": 790
+    },
+    {
+      "epoch": 1.478743068391867,
+      "grad_norm": 0.1579107642173767,
+      "learning_rate": 0.00012606284658040666,
+      "loss": 0.0805,
+      "step": 800
+    },
+    {
+      "epoch": 1.478743068391867,
+      "eval_accuracy": 0.9823182711198428,
+      "eval_loss": 0.07117750495672226,
+      "eval_runtime": 53.0346,
+      "eval_samples_per_second": 28.793,
+      "eval_steps_per_second": 3.601,
+      "step": 800
+    },
+    {
+      "epoch": 1.4972273567467653,
+      "grad_norm": 7.252885341644287,
+      "learning_rate": 0.00012513863216266174,
+      "loss": 0.0848,
+      "step": 810
+    },
+    {
+      "epoch": 1.5157116451016637,
+      "grad_norm": 0.25338369607925415,
+      "learning_rate": 0.00012421441774491682,
+      "loss": 0.0689,
+      "step": 820
+    },
+    {
+      "epoch": 1.5341959334565618,
+      "grad_norm": 3.66860032081604,
+      "learning_rate": 0.0001232902033271719,
+      "loss": 0.041,
+      "step": 830
+    },
+    {
+      "epoch": 1.5526802218114604,
+      "grad_norm": 9.176445960998535,
+      "learning_rate": 0.000122365988909427,
+      "loss": 0.111,
+      "step": 840
+    },
+    {
+      "epoch": 1.5711645101663585,
+      "grad_norm": 0.032652150839567184,
+      "learning_rate": 0.00012144177449168208,
+      "loss": 0.0519,
+      "step": 850
+    },
+    {
+      "epoch": 1.589648798521257,
+      "grad_norm": 0.054165273904800415,
+      "learning_rate": 0.00012051756007393715,
+      "loss": 0.0661,
+      "step": 860
+    },
+    {
+      "epoch": 1.6081330868761552,
+      "grad_norm": 0.10612482577562332,
+      "learning_rate": 0.00011959334565619225,
+      "loss": 0.0157,
+      "step": 870
+    },
+    {
+      "epoch": 1.6266173752310538,
+      "grad_norm": 0.7138892412185669,
+      "learning_rate": 0.00011866913123844731,
+      "loss": 0.1159,
+      "step": 880
+    },
+    {
+      "epoch": 1.645101663585952,
+      "grad_norm": 0.0576617456972599,
+      "learning_rate": 0.00011774491682070241,
+      "loss": 0.1059,
+      "step": 890
+    },
+    {
+      "epoch": 1.6635859519408502,
+      "grad_norm": 2.485743999481201,
+      "learning_rate": 0.00011682070240295748,
+      "loss": 0.0889,
+      "step": 900
+    },
+    {
+      "epoch": 1.6635859519408502,
+      "eval_accuracy": 0.9796987557301899,
+      "eval_loss": 0.07181376963853836,
+      "eval_runtime": 53.7952,
+      "eval_samples_per_second": 28.385,
+      "eval_steps_per_second": 3.551,
+      "step": 900
+    },
+    {
+      "epoch": 1.6820702402957486,
+      "grad_norm": 0.25389525294303894,
+      "learning_rate": 0.00011589648798521257,
+      "loss": 0.0478,
+      "step": 910
+    },
+    {
+      "epoch": 1.700554528650647,
+      "grad_norm": 0.040639039129018784,
+      "learning_rate": 0.00011497227356746765,
+      "loss": 0.0579,
+      "step": 920
+    },
+    {
+      "epoch": 1.7190388170055453,
+      "grad_norm": 0.04252118989825249,
+      "learning_rate": 0.00011404805914972275,
+      "loss": 0.0414,
+      "step": 930
+    },
+    {
+      "epoch": 1.7375231053604436,
+      "grad_norm": 0.03039310872554779,
+      "learning_rate": 0.00011312384473197783,
+      "loss": 0.1247,
+      "step": 940
+    },
+    {
+      "epoch": 1.756007393715342,
+      "grad_norm": 0.04092634469270706,
+      "learning_rate": 0.00011219963031423291,
+      "loss": 0.0485,
+      "step": 950
+    },
+    {
+      "epoch": 1.7744916820702403,
+      "grad_norm": 0.02784869633615017,
+      "learning_rate": 0.000111275415896488,
+      "loss": 0.044,
+      "step": 960
+    },
+    {
+      "epoch": 1.7929759704251387,
+      "grad_norm": 0.6377788186073303,
+      "learning_rate": 0.00011035120147874307,
+      "loss": 0.0833,
+      "step": 970
+    },
+    {
+      "epoch": 1.8114602587800368,
+      "grad_norm": 0.0410403273999691,
+      "learning_rate": 0.00010942698706099817,
+      "loss": 0.0079,
+      "step": 980
+    },
+    {
+      "epoch": 1.8299445471349354,
+      "grad_norm": 0.16617639362812042,
+      "learning_rate": 0.00010850277264325324,
+      "loss": 0.0562,
+      "step": 990
+    },
+    {
+      "epoch": 1.8484288354898335,
+      "grad_norm": 6.131214141845703,
+      "learning_rate": 0.00010757855822550833,
+      "loss": 0.0503,
+      "step": 1000
+    },
+    {
+      "epoch": 1.8484288354898335,
+      "eval_accuracy": 0.9934512115258677,
+      "eval_loss": 0.0250676441937685,
+      "eval_runtime": 53.2731,
+      "eval_samples_per_second": 28.664,
+      "eval_steps_per_second": 3.585,
+      "step": 1000
+    },
+    {
+      "epoch": 1.866913123844732,
+      "grad_norm": 0.07335863262414932,
+      "learning_rate": 0.0001066543438077634,
+      "loss": 0.0444,
+      "step": 1010
+    },
+    {
+      "epoch": 1.8853974121996302,
+      "grad_norm": 0.034475117921829224,
+      "learning_rate": 0.0001057301293900185,
+      "loss": 0.0513,
+      "step": 1020
+    },
+    {
+      "epoch": 1.9038817005545288,
+      "grad_norm": 0.035967420786619186,
+      "learning_rate": 0.00010480591497227356,
+      "loss": 0.0669,
+      "step": 1030
+    },
+    {
+      "epoch": 1.922365988909427,
+      "grad_norm": 0.029034554958343506,
+      "learning_rate": 0.00010388170055452866,
+      "loss": 0.0278,
+      "step": 1040
+    },
+    {
+      "epoch": 1.9408502772643255,
+      "grad_norm": 3.698307514190674,
+      "learning_rate": 0.00010295748613678373,
+      "loss": 0.0547,
+      "step": 1050
+    },
+    {
+      "epoch": 1.9593345656192236,
+      "grad_norm": 0.040026549249887466,
+      "learning_rate": 0.00010203327171903882,
+      "loss": 0.0065,
+      "step": 1060
+    },
+    {
+      "epoch": 1.9778188539741222,
+      "grad_norm": 3.3067240715026855,
+      "learning_rate": 0.0001011090573012939,
+      "loss": 0.0828,
+      "step": 1070
+    },
+    {
+      "epoch": 1.9963031423290203,
+      "grad_norm": 0.05000556632876396,
+      "learning_rate": 0.000100184842883549,
+      "loss": 0.0632,
+      "step": 1080
+    },
+    {
+      "epoch": 2.014787430683919,
+      "grad_norm": 0.04542790353298187,
+      "learning_rate": 9.926062846580408e-05,
+      "loss": 0.0682,
+      "step": 1090
+    },
+    {
+      "epoch": 2.033271719038817,
+      "grad_norm": 0.030154038220643997,
+      "learning_rate": 9.833641404805916e-05,
+      "loss": 0.0225,
+      "step": 1100
+    },
+    {
+      "epoch": 2.033271719038817,
+      "eval_accuracy": 0.9967256057629339,
+      "eval_loss": 0.01773611083626747,
+      "eval_runtime": 52.5689,
+      "eval_samples_per_second": 29.048,
+      "eval_steps_per_second": 3.633,
+      "step": 1100
+    },
+    {
+      "epoch": 2.0517560073937156,
+      "grad_norm": 0.3824068307876587,
+      "learning_rate": 9.741219963031424e-05,
+      "loss": 0.0194,
+      "step": 1110
+    },
+    {
+      "epoch": 2.0702402957486137,
+      "grad_norm": 0.020000776275992393,
+      "learning_rate": 9.648798521256932e-05,
+      "loss": 0.0259,
+      "step": 1120
+    },
+    {
+      "epoch": 2.088724584103512,
+      "grad_norm": 3.488415241241455,
+      "learning_rate": 9.55637707948244e-05,
+      "loss": 0.0629,
+      "step": 1130
+    },
+    {
+      "epoch": 2.1072088724584104,
+      "grad_norm": 10.373331069946289,
+      "learning_rate": 9.463955637707949e-05,
+      "loss": 0.015,
+      "step": 1140
+    },
+    {
+      "epoch": 2.1256931608133085,
+      "grad_norm": 0.23100066184997559,
+      "learning_rate": 9.371534195933457e-05,
+      "loss": 0.0619,
+      "step": 1150
+    },
+    {
+      "epoch": 2.144177449168207,
+      "grad_norm": 0.07692666351795197,
+      "learning_rate": 9.279112754158965e-05,
+      "loss": 0.06,
+      "step": 1160
+    },
+    {
+      "epoch": 2.162661737523105,
+      "grad_norm": 0.057554759085178375,
+      "learning_rate": 9.186691312384473e-05,
+      "loss": 0.0079,
+      "step": 1170
+    },
+    {
+      "epoch": 2.1811460258780038,
+      "grad_norm": 0.039722565561532974,
+      "learning_rate": 9.094269870609981e-05,
+      "loss": 0.0581,
+      "step": 1180
+    },
+    {
+      "epoch": 2.199630314232902,
+      "grad_norm": 0.021510232239961624,
+      "learning_rate": 9.001848428835489e-05,
+      "loss": 0.0052,
+      "step": 1190
+    },
+    {
+      "epoch": 2.2181146025878005,
+      "grad_norm": 0.019746674224734306,
+      "learning_rate": 8.909426987060999e-05,
+      "loss": 0.0049,
+      "step": 1200
+    },
+    {
+      "epoch": 2.2181146025878005,
+      "eval_accuracy": 0.9921414538310412,
+      "eval_loss": 0.024552814662456512,
+      "eval_runtime": 52.686,
+      "eval_samples_per_second": 28.983,
+      "eval_steps_per_second": 3.625,
+      "step": 1200
+    },
+    {
+      "epoch": 2.2365988909426986,
+      "grad_norm": 4.809552192687988,
+      "learning_rate": 8.817005545286507e-05,
+      "loss": 0.098,
+      "step": 1210
+    },
+    {
+      "epoch": 2.255083179297597,
+      "grad_norm": 0.22049099206924438,
+      "learning_rate": 8.724584103512015e-05,
+      "loss": 0.1328,
+      "step": 1220
+    },
+    {
+      "epoch": 2.2735674676524953,
+      "grad_norm": 0.02430686727166176,
+      "learning_rate": 8.632162661737525e-05,
+      "loss": 0.0332,
+      "step": 1230
+    },
+    {
+      "epoch": 2.292051756007394,
+      "grad_norm": 0.16566839814186096,
+      "learning_rate": 8.539741219963033e-05,
+      "loss": 0.0242,
+      "step": 1240
+    },
+    {
+      "epoch": 2.310536044362292,
+      "grad_norm": 0.07895852625370026,
+      "learning_rate": 8.447319778188541e-05,
+      "loss": 0.0394,
+      "step": 1250
+    },
+    {
+      "epoch": 2.3290203327171906,
+      "grad_norm": 0.01941494271159172,
+      "learning_rate": 8.354898336414049e-05,
+      "loss": 0.0373,
+      "step": 1260
+    },
+    {
+      "epoch": 2.3475046210720887,
+      "grad_norm": 0.018574291840195656,
+      "learning_rate": 8.262476894639557e-05,
+      "loss": 0.0582,
+      "step": 1270
+    },
+    {
+      "epoch": 2.3659889094269873,
+      "grad_norm": 9.006904602050781,
+      "learning_rate": 8.170055452865065e-05,
+      "loss": 0.075,
+      "step": 1280
+    },
+    {
+      "epoch": 2.3844731977818854,
+      "grad_norm": 0.5771515965461731,
+      "learning_rate": 8.077634011090573e-05,
+      "loss": 0.0217,
+      "step": 1290
+    },
+    {
+      "epoch": 2.402957486136784,
+      "grad_norm": 0.01840708591043949,
+      "learning_rate": 7.985212569316082e-05,
+      "loss": 0.0152,
+      "step": 1300
+    },
+    {
+      "epoch": 2.402957486136784,
+      "eval_accuracy": 0.9986902423051736,
+      "eval_loss": 0.008291647769510746,
+      "eval_runtime": 53.4499,
+      "eval_samples_per_second": 28.569,
+      "eval_steps_per_second": 3.573,
+      "step": 1300
+    },
+    {
+      "epoch": 2.421441774491682,
+      "grad_norm": 0.017435792833566666,
+      "learning_rate": 7.89279112754159e-05,
+      "loss": 0.0448,
+      "step": 1310
+    },
+    {
+      "epoch": 2.43992606284658,
+      "grad_norm": 0.7729086875915527,
+      "learning_rate": 7.800369685767098e-05,
+      "loss": 0.0444,
+      "step": 1320
+    },
+    {
+      "epoch": 2.4584103512014788,
+      "grad_norm": 0.059264715760946274,
+      "learning_rate": 7.707948243992606e-05,
+      "loss": 0.0397,
+      "step": 1330
+    },
+    {
+      "epoch": 2.476894639556377,
+      "grad_norm": 0.024057278409600258,
+      "learning_rate": 7.615526802218114e-05,
+      "loss": 0.028,
+      "step": 1340
+    },
+    {
+      "epoch": 2.4953789279112755,
+      "grad_norm": 0.022951899096369743,
+      "learning_rate": 7.523105360443624e-05,
+      "loss": 0.0444,
+      "step": 1350
+    },
+    {
+      "epoch": 2.5138632162661736,
+      "grad_norm": 0.021782563999295235,
+      "learning_rate": 7.430683918669132e-05,
+      "loss": 0.0385,
+      "step": 1360
+    },
+    {
+      "epoch": 2.532347504621072,
+      "grad_norm": 0.1371038258075714,
+      "learning_rate": 7.33826247689464e-05,
+      "loss": 0.0188,
+      "step": 1370
+    },
+    {
+      "epoch": 2.5508317929759703,
+      "grad_norm": 0.7299683690071106,
+      "learning_rate": 7.245841035120148e-05,
+      "loss": 0.0845,
+      "step": 1380
+    },
+    {
+      "epoch": 2.569316081330869,
+      "grad_norm": 0.34656259417533875,
+      "learning_rate": 7.153419593345656e-05,
+      "loss": 0.0436,
+      "step": 1390
+    },
+    {
+      "epoch": 2.587800369685767,
+      "grad_norm": 0.10165718197822571,
+      "learning_rate": 7.060998151571166e-05,
+      "loss": 0.08,
+      "step": 1400
+    },
+    {
+      "epoch": 2.587800369685767,
+      "eval_accuracy": 0.9941060903732809,
+      "eval_loss": 0.021378275007009506,
+      "eval_runtime": 52.8132,
+      "eval_samples_per_second": 28.913,
+      "eval_steps_per_second": 3.617,
+      "step": 1400
+    },
+    {
+      "epoch": 2.6062846580406656,
+      "grad_norm": 5.586907863616943,
+      "learning_rate": 6.968576709796674e-05,
+      "loss": 0.0295,
+      "step": 1410
+    },
+    {
+      "epoch": 2.6247689463955637,
+      "grad_norm": 0.0221896730363369,
+      "learning_rate": 6.876155268022182e-05,
+      "loss": 0.0627,
+      "step": 1420
+    },
+    {
+      "epoch": 2.6432532347504623,
+      "grad_norm": 0.30416977405548096,
+      "learning_rate": 6.78373382624769e-05,
+      "loss": 0.0035,
+      "step": 1430
+    },
+    {
+      "epoch": 2.6617375231053604,
+      "grad_norm": 0.102454274892807,
+      "learning_rate": 6.691312384473198e-05,
+      "loss": 0.0641,
+      "step": 1440
+    },
+    {
+      "epoch": 2.6802218114602585,
+      "grad_norm": 0.023131974041461945,
+      "learning_rate": 6.598890942698706e-05,
+      "loss": 0.0326,
+      "step": 1450
+    },
+    {
+      "epoch": 2.698706099815157,
+      "grad_norm": 0.09067076444625854,
+      "learning_rate": 6.506469500924215e-05,
+      "loss": 0.017,
+      "step": 1460
+    },
+    {
+      "epoch": 2.7171903881700556,
+      "grad_norm": 3.3906850814819336,
+      "learning_rate": 6.414048059149723e-05,
+      "loss": 0.029,
+      "step": 1470
+    },
+    {
+      "epoch": 2.7356746765249538,
+      "grad_norm": 0.061337146908044815,
+      "learning_rate": 6.321626617375231e-05,
+      "loss": 0.0168,
+      "step": 1480
+    },
+    {
+      "epoch": 2.754158964879852,
+      "grad_norm": 0.19621238112449646,
+      "learning_rate": 6.229205175600739e-05,
+      "loss": 0.006,
+      "step": 1490
+    },
+    {
+      "epoch": 2.7726432532347505,
+      "grad_norm": 0.012029612436890602,
+      "learning_rate": 6.136783733826249e-05,
+      "loss": 0.0043,
+      "step": 1500
+    },
+    {
+      "epoch": 2.7726432532347505,
+      "eval_accuracy": 0.9980353634577603,
+      "eval_loss": 0.006946724373847246,
+      "eval_runtime": 52.203,
+      "eval_samples_per_second": 29.251,
+      "eval_steps_per_second": 3.659,
+      "step": 1500
+    },
+    {
+      "epoch": 2.791127541589649,
+      "grad_norm": 0.014309920370578766,
+      "learning_rate": 6.044362292051756e-05,
+      "loss": 0.0074,
+      "step": 1510
+    },
+    {
+      "epoch": 2.809611829944547,
+      "grad_norm": 3.063054323196411,
+      "learning_rate": 5.951940850277264e-05,
+      "loss": 0.0045,
+      "step": 1520
+    },
+    {
+      "epoch": 2.8280961182994453,
+      "grad_norm": 0.011617097072303295,
+      "learning_rate": 5.859519408502773e-05,
+      "loss": 0.0525,
+      "step": 1530
+    },
+    {
+      "epoch": 2.846580406654344,
+      "grad_norm": 5.252607345581055,
+      "learning_rate": 5.767097966728281e-05,
+      "loss": 0.0104,
+      "step": 1540
+    },
+    {
+      "epoch": 2.865064695009242,
+      "grad_norm": 0.014846362173557281,
+      "learning_rate": 5.674676524953789e-05,
+      "loss": 0.0265,
+      "step": 1550
+    },
+    {
+      "epoch": 2.8835489833641406,
+      "grad_norm": 0.011737200431525707,
+      "learning_rate": 5.5822550831792974e-05,
+      "loss": 0.0543,
+      "step": 1560
+    },
+    {
+      "epoch": 2.9020332717190387,
+      "grad_norm": 0.012772896327078342,
+      "learning_rate": 5.4898336414048056e-05,
+      "loss": 0.0018,
+      "step": 1570
+    },
+    {
+      "epoch": 2.9205175600739373,
+      "grad_norm": 0.06962817162275314,
+      "learning_rate": 5.397412199630314e-05,
+      "loss": 0.0234,
+      "step": 1580
+    },
+    {
+      "epoch": 2.9390018484288354,
+      "grad_norm": 0.019341696053743362,
+      "learning_rate": 5.304990757855823e-05,
+      "loss": 0.105,
+      "step": 1590
+    },
+    {
+      "epoch": 2.957486136783734,
+      "grad_norm": 4.673314571380615,
+      "learning_rate": 5.2125693160813314e-05,
+      "loss": 0.0501,
+      "step": 1600
+    },
+    {
+      "epoch": 2.957486136783734,
+      "eval_accuracy": 0.9967256057629339,
+      "eval_loss": 0.015068226493895054,
+      "eval_runtime": 51.6353,
+      "eval_samples_per_second": 29.573,
+      "eval_steps_per_second": 3.699,
+      "step": 1600
+    },
+    {
+      "epoch": 2.975970425138632,
+      "grad_norm": 0.018514908850193024,
+      "learning_rate": 5.1201478743068395e-05,
+      "loss": 0.0312,
+      "step": 1610
+    },
+    {
+      "epoch": 2.9944547134935307,
+      "grad_norm": 0.0645008459687233,
+      "learning_rate": 5.027726432532348e-05,
+      "loss": 0.0489,
+      "step": 1620
+    },
+    {
+      "epoch": 3.0129390018484288,
+      "grad_norm": 0.017880817875266075,
+      "learning_rate": 4.935304990757856e-05,
+      "loss": 0.0366,
+      "step": 1630
+    },
+    {
+      "epoch": 3.0314232902033273,
+      "grad_norm": 0.04122663289308548,
+      "learning_rate": 4.8428835489833646e-05,
+      "loss": 0.0539,
+      "step": 1640
+    },
+    {
+      "epoch": 3.0499075785582255,
+      "grad_norm": 0.022179430350661278,
+      "learning_rate": 4.750462107208873e-05,
+      "loss": 0.0248,
+      "step": 1650
+    },
+    {
+      "epoch": 3.068391866913124,
+      "grad_norm": 0.924117386341095,
+      "learning_rate": 4.658040665434381e-05,
+      "loss": 0.02,
+      "step": 1660
+    },
+    {
+      "epoch": 3.086876155268022,
+      "grad_norm": 0.01614381931722164,
+      "learning_rate": 4.565619223659889e-05,
+      "loss": 0.023,
+      "step": 1670
+    },
+    {
+      "epoch": 3.1053604436229207,
+      "grad_norm": 0.05051511153578758,
+      "learning_rate": 4.473197781885398e-05,
+      "loss": 0.0041,
+      "step": 1680
+    },
+    {
+      "epoch": 3.123844731977819,
+      "grad_norm": 0.02787856012582779,
+      "learning_rate": 4.380776340110906e-05,
+      "loss": 0.0163,
+      "step": 1690
+    },
+    {
+      "epoch": 3.142329020332717,
+      "grad_norm": 0.21667926013469696,
+      "learning_rate": 4.288354898336414e-05,
+      "loss": 0.0186,
+      "step": 1700
+    },
+    {
+      "epoch": 3.142329020332717,
+      "eval_accuracy": 0.9973804846103471,
+      "eval_loss": 0.007818276062607765,
+      "eval_runtime": 52.8582,
+      "eval_samples_per_second": 28.889,
+      "eval_steps_per_second": 3.613,
+      "step": 1700
+    },
+    {
+      "epoch": 3.1608133086876156,
+      "grad_norm": 0.02714550867676735,
+      "learning_rate": 4.195933456561922e-05,
+      "loss": 0.0178,
+      "step": 1710
+    },
+    {
+      "epoch": 3.1792975970425137,
+      "grad_norm": 0.5191987156867981,
+      "learning_rate": 4.1035120147874305e-05,
+      "loss": 0.0582,
+      "step": 1720
+    },
+    {
+      "epoch": 3.1977818853974123,
+      "grad_norm": 0.02666807919740677,
+      "learning_rate": 4.011090573012939e-05,
+      "loss": 0.007,
+      "step": 1730
+    },
+    {
+      "epoch": 3.2162661737523104,
+      "grad_norm": 0.06601597368717194,
+      "learning_rate": 3.9186691312384474e-05,
+      "loss": 0.0477,
+      "step": 1740
+    },
+    {
+      "epoch": 3.234750462107209,
+      "grad_norm": 0.0280216746032238,
+      "learning_rate": 3.826247689463956e-05,
+      "loss": 0.0048,
+      "step": 1750
+    },
+    {
+      "epoch": 3.253234750462107,
+      "grad_norm": 4.720592021942139,
+      "learning_rate": 3.7338262476894644e-05,
+      "loss": 0.0186,
+      "step": 1760
+    },
+    {
+      "epoch": 3.2717190388170057,
+      "grad_norm": 0.01574169471859932,
+      "learning_rate": 3.6414048059149726e-05,
+      "loss": 0.0017,
+      "step": 1770
+    },
+    {
+      "epoch": 3.290203327171904,
+      "grad_norm": 0.02533087506890297,
+      "learning_rate": 3.548983364140481e-05,
+      "loss": 0.0025,
+      "step": 1780
+    },
+    {
+      "epoch": 3.3086876155268024,
+      "grad_norm": 0.013142619282007217,
+      "learning_rate": 3.456561922365989e-05,
+      "loss": 0.0376,
+      "step": 1790
+    },
+    {
+      "epoch": 3.3271719038817005,
+      "grad_norm": 0.07316397875547409,
+      "learning_rate": 3.364140480591497e-05,
+      "loss": 0.0033,
+      "step": 1800
+    },
+    {
+      "epoch": 3.3271719038817005,
+      "eval_accuracy": 0.9960707269155207,
+      "eval_loss": 0.013949541375041008,
+      "eval_runtime": 53.0604,
+      "eval_samples_per_second": 28.779,
+      "eval_steps_per_second": 3.6,
+      "step": 1800
+    },
+    {
+      "epoch": 3.345656192236599,
+      "grad_norm": 0.015296310186386108,
+      "learning_rate": 3.271719038817006e-05,
+      "loss": 0.0015,
+      "step": 1810
+    },
+    {
+      "epoch": 3.364140480591497,
+      "grad_norm": 5.960048198699951,
+      "learning_rate": 3.179297597042514e-05,
+      "loss": 0.0222,
+      "step": 1820
+    },
+    {
+      "epoch": 3.3826247689463957,
+      "grad_norm": 0.21616186201572418,
+      "learning_rate": 3.086876155268023e-05,
+      "loss": 0.0038,
+      "step": 1830
+    },
+    {
+      "epoch": 3.401109057301294,
+      "grad_norm": 0.015051410533487797,
+      "learning_rate": 2.994454713493531e-05,
+      "loss": 0.0019,
+      "step": 1840
+    },
+    {
+      "epoch": 3.4195933456561924,
+      "grad_norm": 13.381204605102539,
+      "learning_rate": 2.902033271719039e-05,
+      "loss": 0.0182,
+      "step": 1850
+    },
+    {
+      "epoch": 3.4380776340110906,
+      "grad_norm": 0.1726062297821045,
+      "learning_rate": 2.8096118299445472e-05,
+      "loss": 0.0022,
+      "step": 1860
+    },
+    {
+      "epoch": 3.4565619223659887,
+      "grad_norm": 0.01701999455690384,
+      "learning_rate": 2.7171903881700557e-05,
+      "loss": 0.0014,
+      "step": 1870
+    },
+    {
+      "epoch": 3.4750462107208873,
+      "grad_norm": 0.013869056478142738,
+      "learning_rate": 2.624768946395564e-05,
+      "loss": 0.0013,
+      "step": 1880
+    },
+    {
+      "epoch": 3.4935304990757854,
+      "grad_norm": 0.021621432155370712,
+      "learning_rate": 2.532347504621072e-05,
+      "loss": 0.0016,
+      "step": 1890
+    },
+    {
+      "epoch": 3.512014787430684,
+      "grad_norm": 1.3106377124786377,
+      "learning_rate": 2.4399260628465805e-05,
+      "loss": 0.0023,
+      "step": 1900
+    },
+    {
+      "epoch": 3.512014787430684,
+      "eval_accuracy": 0.9986902423051736,
+      "eval_loss": 0.0075506423600018024,
+      "eval_runtime": 50.8135,
+      "eval_samples_per_second": 30.051,
+      "eval_steps_per_second": 3.759,
+      "step": 1900
+    },
+    {
+      "epoch": 3.530499075785582,
+      "grad_norm": 0.01985827274620533,
+      "learning_rate": 2.347504621072089e-05,
+      "loss": 0.0016,
+      "step": 1910
+    },
+    {
+      "epoch": 3.5489833641404807,
+      "grad_norm": 0.013897390104830265,
+      "learning_rate": 2.255083179297597e-05,
+      "loss": 0.0308,
+      "step": 1920
+    },
+    {
+      "epoch": 3.567467652495379,
+      "grad_norm": 0.009370139800012112,
+      "learning_rate": 2.1626617375231053e-05,
+      "loss": 0.0123,
+      "step": 1930
+    },
+    {
+      "epoch": 3.5859519408502774,
+      "grad_norm": 0.019544150680303574,
+      "learning_rate": 2.0702402957486137e-05,
+      "loss": 0.0257,
+      "step": 1940
+    },
+    {
+      "epoch": 3.6044362292051755,
+      "grad_norm": 0.018746808171272278,
+      "learning_rate": 1.9778188539741222e-05,
+      "loss": 0.03,
+      "step": 1950
+    },
+    {
+      "epoch": 3.622920517560074,
+      "grad_norm": 0.009196238592267036,
+      "learning_rate": 1.8853974121996304e-05,
+      "loss": 0.0011,
+      "step": 1960
+    },
+    {
+      "epoch": 3.641404805914972,
+      "grad_norm": 0.011442320421338081,
+      "learning_rate": 1.7929759704251385e-05,
+      "loss": 0.0012,
+      "step": 1970
+    },
+    {
+      "epoch": 3.6598890942698707,
+      "grad_norm": 0.010710498318076134,
+      "learning_rate": 1.700554528650647e-05,
+      "loss": 0.0019,
+      "step": 1980
+    },
+    {
+      "epoch": 3.678373382624769,
+      "grad_norm": 0.06102241575717926,
+      "learning_rate": 1.6081330868761555e-05,
+      "loss": 0.0012,
+      "step": 1990
+    },
+    {
+      "epoch": 3.6968576709796674,
+      "grad_norm": 0.008612744510173798,
+      "learning_rate": 1.5157116451016636e-05,
+      "loss": 0.0054,
+      "step": 2000
+    },
+    {
+      "epoch": 3.6968576709796674,
+      "eval_accuracy": 0.9993451211525868,
+      "eval_loss": 0.0047513521276414394,
+      "eval_runtime": 52.2618,
+      "eval_samples_per_second": 29.218,
+      "eval_steps_per_second": 3.655,
+      "step": 2000
+    },
+    {
+      "epoch": 3.7153419593345656,
+      "grad_norm": 0.008234468288719654,
+      "learning_rate": 1.423290203327172e-05,
+      "loss": 0.043,
+      "step": 2010
+    },
+    {
+      "epoch": 3.733826247689464,
+      "grad_norm": 0.008917649276554585,
+      "learning_rate": 1.3308687615526803e-05,
+      "loss": 0.0384,
+      "step": 2020
+    },
+    {
+      "epoch": 3.7523105360443623,
+      "grad_norm": 0.00844865757972002,
+      "learning_rate": 1.2384473197781886e-05,
+      "loss": 0.0013,
+      "step": 2030
+    },
+    {
+      "epoch": 3.7707948243992604,
+      "grad_norm": 0.008531128987669945,
+      "learning_rate": 1.1460258780036969e-05,
+      "loss": 0.0195,
+      "step": 2040
+    },
+    {
+      "epoch": 3.789279112754159,
+      "grad_norm": 0.009270643815398216,
+      "learning_rate": 1.0536044362292052e-05,
+      "loss": 0.0392,
+      "step": 2050
+    },
+    {
+      "epoch": 3.8077634011090575,
+      "grad_norm": 0.009245671331882477,
+      "learning_rate": 9.611829944547135e-06,
+      "loss": 0.0011,
+      "step": 2060
+    },
+    {
+      "epoch": 3.8262476894639557,
+      "grad_norm": 0.01690092496573925,
+      "learning_rate": 8.687615526802218e-06,
+      "loss": 0.0016,
+      "step": 2070
+    },
+    {
+      "epoch": 3.844731977818854,
+      "grad_norm": 0.015731679275631905,
+      "learning_rate": 7.763401109057302e-06,
+      "loss": 0.0317,
+      "step": 2080
+    },
+    {
+      "epoch": 3.8632162661737524,
+      "grad_norm": 3.0953285694122314,
+      "learning_rate": 6.931608133086876e-06,
+      "loss": 0.0454,
+      "step": 2090
+    },
+    {
+      "epoch": 3.8817005545286505,
+      "grad_norm": 6.279654502868652,
+      "learning_rate": 6.0073937153419595e-06,
+      "loss": 0.0168,
+      "step": 2100
+    },
+    {
+      "epoch": 3.8817005545286505,
+      "eval_accuracy": 0.9986902423051736,
+      "eval_loss": 0.006641203537583351,
+      "eval_runtime": 52.9204,
+      "eval_samples_per_second": 28.855,
+      "eval_steps_per_second": 3.609,
+      "step": 2100
+    },
+    {
+      "epoch": 3.900184842883549,
+      "grad_norm": 0.009602474048733711,
+      "learning_rate": 5.083179297597043e-06,
+      "loss": 0.0011,
+      "step": 2110
+    },
+    {
+      "epoch": 3.918669131238447,
+      "grad_norm": 12.240010261535645,
+      "learning_rate": 4.158964879852126e-06,
+      "loss": 0.0236,
+      "step": 2120
+    },
+    {
+      "epoch": 3.9371534195933457,
+      "grad_norm": 0.03988449275493622,
+      "learning_rate": 3.234750462107209e-06,
+      "loss": 0.0014,
+      "step": 2130
+    },
+    {
+      "epoch": 3.955637707948244,
+      "grad_norm": 5.554378986358643,
+      "learning_rate": 2.310536044362292e-06,
+      "loss": 0.0041,
+      "step": 2140
+    },
+    {
+      "epoch": 3.9741219963031424,
+      "grad_norm": 0.0083112558349967,
+      "learning_rate": 1.3863216266173753e-06,
+      "loss": 0.02,
+      "step": 2150
+    },
+    {
+      "epoch": 3.9926062846580406,
+      "grad_norm": 2.2959258556365967,
+      "learning_rate": 4.621072088724585e-07,
+      "loss": 0.0053,
+      "step": 2160
+    },
+    {
+      "epoch": 4.0,
+      "step": 2164,
+      "total_flos": 2.6818427765818e+18,
+      "train_loss": 0.0841421499820822,
+      "train_runtime": 2597.595,
+      "train_samples_per_second": 13.323,
+      "train_steps_per_second": 0.833
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 2164,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.6818427765818e+18,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}