diff --git "a/checkpoint-597/trainer_state.json" "b/checkpoint-597/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-597/trainer_state.json"
@@ -0,0 +1,4296 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9547738693467336,
+  "eval_steps": 50,
+  "global_step": 597,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.6762334704399109,
+      "learning_rate": 1e-05,
+      "loss": 1.3026,
+      "step": 1
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 1.343465805053711,
+      "eval_runtime": 2.9584,
+      "eval_samples_per_second": 33.802,
+      "eval_steps_per_second": 16.901,
+      "step": 1
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.7722721695899963,
+      "learning_rate": 2e-05,
+      "loss": 1.5419,
+      "step": 2
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.6532348394393921,
+      "learning_rate": 3e-05,
+      "loss": 1.4429,
+      "step": 3
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.8427589535713196,
+      "learning_rate": 4e-05,
+      "loss": 1.4,
+      "step": 4
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.9355791807174683,
+      "learning_rate": 5e-05,
+      "loss": 1.2583,
+      "step": 5
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.6357808113098145,
+      "learning_rate": 6e-05,
+      "loss": 1.2655,
+      "step": 6
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.963829517364502,
+      "learning_rate": 7e-05,
+      "loss": 1.42,
+      "step": 7
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6698102951049805,
+      "learning_rate": 8e-05,
+      "loss": 1.3938,
+      "step": 8
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5394894480705261,
+      "learning_rate": 9e-05,
+      "loss": 1.2234,
+      "step": 9
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.8773290514945984,
+      "learning_rate": 0.0001,
+      "loss": 1.4257,
+      "step": 10
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.7960235476493835,
+      "learning_rate": 0.00011000000000000002,
+      "loss": 1.4272,
+      "step": 11
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.7909610867500305,
+      "learning_rate": 0.00012,
+      "loss": 1.352,
+      "step": 12
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.8417578339576721,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 1.2048,
+      "step": 13
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.8076886534690857,
+      "learning_rate": 0.00014,
+      "loss": 1.4186,
+      "step": 14
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7543106079101562,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.0873,
+      "step": 15
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.9430835247039795,
+      "learning_rate": 0.00016,
+      "loss": 1.4061,
+      "step": 16
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.7473496794700623,
+      "learning_rate": 0.00017,
+      "loss": 1.1407,
+      "step": 17
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.8123806715011597,
+      "learning_rate": 0.00018,
+      "loss": 1.4394,
+      "step": 18
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.7778059244155884,
+      "learning_rate": 0.00019,
+      "loss": 1.2752,
+      "step": 19
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.7027471661567688,
+      "learning_rate": 0.0002,
+      "loss": 1.3107,
+      "step": 20
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.8443830609321594,
+      "learning_rate": 0.00019999918050612108,
+      "loss": 1.2204,
+      "step": 21
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.6853266358375549,
+      "learning_rate": 0.00019999672203791565,
+      "loss": 1.2231,
+      "step": 22
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.8757483959197998,
+      "learning_rate": 0.00019999262463567773,
+      "loss": 1.2069,
+      "step": 23
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.7184014320373535,
+      "learning_rate": 0.00019998688836656323,
+      "loss": 1.2124,
+      "step": 24
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.6530072093009949,
+      "learning_rate": 0.0001999795133245889,
+      "loss": 1.1672,
+      "step": 25
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.7211533188819885,
+      "learning_rate": 0.0001999704996306308,
+      "loss": 1.3207,
+      "step": 26
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.7048207521438599,
+      "learning_rate": 0.00019995984743242226,
+      "loss": 1.2003,
+      "step": 27
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.6881248354911804,
+      "learning_rate": 0.00019994755690455152,
+      "loss": 1.117,
+      "step": 28
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.7877801656723022,
+      "learning_rate": 0.00019993362824845875,
+      "loss": 1.0531,
+      "step": 29
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.749905526638031,
+      "learning_rate": 0.000199918061692433,
+      "loss": 1.1462,
+      "step": 30
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.67184978723526,
+      "learning_rate": 0.00019990085749160822,
+      "loss": 1.0939,
+      "step": 31
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.6622844934463501,
+      "learning_rate": 0.0001998820159279591,
+      "loss": 1.1369,
+      "step": 32
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.763306736946106,
+      "learning_rate": 0.00019986153731029656,
+      "loss": 1.3525,
+      "step": 33
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.6171010136604309,
+      "learning_rate": 0.0001998394219742627,
+      "loss": 0.8807,
+      "step": 34
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.7575845718383789,
+      "learning_rate": 0.00019981567028232514,
+      "loss": 1.206,
+      "step": 35
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5694592595100403,
+      "learning_rate": 0.00019979028262377118,
+      "loss": 0.9079,
+      "step": 36
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.7056426405906677,
+      "learning_rate": 0.00019976325941470146,
+      "loss": 1.1133,
+      "step": 37
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.6812122464179993,
+      "learning_rate": 0.00019973460109802305,
+      "loss": 1.2707,
+      "step": 38
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5790569186210632,
+      "learning_rate": 0.0001997043081434423,
+      "loss": 1.0047,
+      "step": 39
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.6529936790466309,
+      "learning_rate": 0.00019967238104745696,
+      "loss": 1.0917,
+      "step": 40
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.6274911165237427,
+      "learning_rate": 0.00019963882033334826,
+      "loss": 1.2586,
+      "step": 41
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.6666668653488159,
+      "learning_rate": 0.00019960362655117218,
+      "loss": 1.1187,
+      "step": 42
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.6239954233169556,
+      "learning_rate": 0.00019956680027775051,
+      "loss": 1.0343,
+      "step": 43
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.6892250180244446,
+      "learning_rate": 0.0001995283421166614,
+      "loss": 1.0254,
+      "step": 44
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.7392664551734924,
+      "learning_rate": 0.00019948825269822934,
+      "loss": 1.0592,
+      "step": 45
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.7541553378105164,
+      "learning_rate": 0.00019944653267951504,
+      "loss": 1.2297,
+      "step": 46
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.685874342918396,
+      "learning_rate": 0.00019940318274430449,
+      "loss": 1.321,
+      "step": 47
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.7901135087013245,
+      "learning_rate": 0.00019935820360309777,
+      "loss": 1.2583,
+      "step": 48
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.6619594693183899,
+      "learning_rate": 0.00019931159599309757,
+      "loss": 0.9762,
+      "step": 49
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.6059371829032898,
+      "learning_rate": 0.00019926336067819684,
+      "loss": 1.1146,
+      "step": 50
+    },
+    {
+      "epoch": 0.25,
+      "eval_loss": 1.1476221084594727,
+      "eval_runtime": 2.9589,
+      "eval_samples_per_second": 33.796,
+      "eval_steps_per_second": 16.898,
+      "step": 50
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.6533025503158569,
+      "learning_rate": 0.00019921349844896654,
+      "loss": 1.2439,
+      "step": 51
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5473713278770447,
+      "learning_rate": 0.00019916201012264254,
+      "loss": 0.8464,
+      "step": 52
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.6035101413726807,
+      "learning_rate": 0.00019910889654311208,
+      "loss": 1.1297,
+      "step": 53
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.7092946767807007,
+      "learning_rate": 0.00019905415858090036,
+      "loss": 1.0365,
+      "step": 54
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.602556049823761,
+      "learning_rate": 0.00019899779713315575,
+      "loss": 1.1238,
+      "step": 55
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.6566863059997559,
+      "learning_rate": 0.00019893981312363562,
+      "loss": 1.1097,
+      "step": 56
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.6582695245742798,
+      "learning_rate": 0.00019888020750269067,
+      "loss": 1.3681,
+      "step": 57
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.509901225566864,
+      "learning_rate": 0.00019881898124724981,
+      "loss": 0.7163,
+      "step": 58
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.6406445503234863,
+      "learning_rate": 0.0001987561353608038,
+      "loss": 1.1309,
+      "step": 59
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5770175457000732,
+      "learning_rate": 0.00019869167087338907,
+      "loss": 1.1706,
+      "step": 60
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.6582055687904358,
+      "learning_rate": 0.00019862558884157068,
+      "loss": 1.1121,
+      "step": 61
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.7646100521087646,
+      "learning_rate": 0.00019855789034842504,
+      "loss": 1.1313,
+      "step": 62
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.7127470970153809,
+      "learning_rate": 0.00019848857650352214,
+      "loss": 1.258,
+      "step": 63
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5721624493598938,
+      "learning_rate": 0.00019841764844290744,
+      "loss": 1.0163,
+      "step": 64
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.6494898796081543,
+      "learning_rate": 0.00019834510732908315,
+      "loss": 1.1974,
+      "step": 65
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.6703062057495117,
+      "learning_rate": 0.00019827095435098925,
+      "loss": 1.1376,
+      "step": 66
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.696711003780365,
+      "learning_rate": 0.000198195190723984,
+      "loss": 0.9931,
+      "step": 67
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.6563432216644287,
+      "learning_rate": 0.0001981178176898239,
+      "loss": 1.2047,
+      "step": 68
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.7269361019134521,
+      "learning_rate": 0.0001980388365166436,
+      "loss": 1.6113,
+      "step": 69
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.6356198191642761,
+      "learning_rate": 0.0001979582484989348,
+      "loss": 1.3778,
+      "step": 70
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.6009278893470764,
+      "learning_rate": 0.00019787605495752528,
+      "loss": 1.2131,
+      "step": 71
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.49109163880348206,
+      "learning_rate": 0.00019779225723955707,
+      "loss": 0.8246,
+      "step": 72
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5709823966026306,
+      "learning_rate": 0.00019770685671846456,
+      "loss": 1.0578,
+      "step": 73
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5613502860069275,
+      "learning_rate": 0.0001976198547939518,
+      "loss": 0.8883,
+      "step": 74
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.750335156917572,
+      "learning_rate": 0.0001975312528919697,
+      "loss": 1.1836,
+      "step": 75
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.6157568693161011,
+      "learning_rate": 0.00019744105246469263,
+      "loss": 1.0637,
+      "step": 76
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.6417941451072693,
+      "learning_rate": 0.00019734925499049447,
+      "loss": 1.2824,
+      "step": 77
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.8214441537857056,
+      "learning_rate": 0.0001972558619739246,
+      "loss": 1.1942,
+      "step": 78
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.6943228244781494,
+      "learning_rate": 0.00019716087494568317,
+      "loss": 1.3261,
+      "step": 79
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.739622950553894,
+      "learning_rate": 0.00019706429546259593,
+      "loss": 1.2639,
+      "step": 80
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.6374944448471069,
+      "learning_rate": 0.00019696612510758876,
+      "loss": 0.9929,
+      "step": 81
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.7595279812812805,
+      "learning_rate": 0.00019686636548966178,
+      "loss": 1.2859,
+      "step": 82
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.6465960144996643,
+      "learning_rate": 0.00019676501824386294,
+      "loss": 1.0333,
+      "step": 83
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.7063401341438293,
+      "learning_rate": 0.00019666208503126112,
+      "loss": 1.2189,
+      "step": 84
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.631826639175415,
+      "learning_rate": 0.00019655756753891916,
+      "loss": 1.2583,
+      "step": 85
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.6506052017211914,
+      "learning_rate": 0.0001964514674798659,
+      "loss": 1.2019,
+      "step": 86
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.7421661615371704,
+      "learning_rate": 0.00019634378659306832,
+      "loss": 1.2122,
+      "step": 87
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.5749310851097107,
+      "learning_rate": 0.00019623452664340306,
+      "loss": 1.0522,
+      "step": 88
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.6523499488830566,
+      "learning_rate": 0.0001961236894216272,
+      "loss": 1.2135,
+      "step": 89
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.5970554947853088,
+      "learning_rate": 0.00019601127674434928,
+      "loss": 1.0297,
+      "step": 90
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.587348461151123,
+      "learning_rate": 0.00019589729045399934,
+      "loss": 1.0214,
+      "step": 91
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.6518609523773193,
+      "learning_rate": 0.00019578173241879872,
+      "loss": 0.9928,
+      "step": 92
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.7513082027435303,
+      "learning_rate": 0.00019566460453272945,
+      "loss": 1.1204,
+      "step": 93
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.8648024201393127,
+      "learning_rate": 0.0001955459087155033,
+      "loss": 1.3671,
+      "step": 94
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.6207080483436584,
+      "learning_rate": 0.0001954256469125301,
+      "loss": 1.1286,
+      "step": 95
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.6174007058143616,
+      "learning_rate": 0.0001953038210948861,
+      "loss": 1.145,
+      "step": 96
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.6160337328910828,
+      "learning_rate": 0.00019518043325928157,
+      "loss": 1.2688,
+      "step": 97
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.662702202796936,
+      "learning_rate": 0.00019505548542802804,
+      "loss": 1.1212,
+      "step": 98
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.7133952379226685,
+      "learning_rate": 0.00019492897964900512,
+      "loss": 1.0514,
+      "step": 99
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.7767614126205444,
+      "learning_rate": 0.00019480091799562704,
+      "loss": 1.2387,
+      "step": 100
+    },
+    {
+      "epoch": 0.5,
+      "eval_loss": 1.1319388151168823,
+      "eval_runtime": 2.9089,
+      "eval_samples_per_second": 34.377,
+      "eval_steps_per_second": 17.189,
+      "step": 100
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.6398429870605469,
+      "learning_rate": 0.00019467130256680868,
+      "loss": 1.0076,
+      "step": 101
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.6510715484619141,
+      "learning_rate": 0.00019454013548693102,
+      "loss": 1.2372,
+      "step": 102
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.7204650044441223,
+      "learning_rate": 0.00019440741890580643,
+      "loss": 1.0999,
+      "step": 103
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.6531095504760742,
+      "learning_rate": 0.00019427315499864344,
+      "loss": 1.1123,
+      "step": 104
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.5871708989143372,
+      "learning_rate": 0.00019413734596601104,
+      "loss": 1.2162,
+      "step": 105
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.6323477625846863,
+      "learning_rate": 0.00019399999403380266,
+      "loss": 1.1369,
+      "step": 106
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.6977123618125916,
+      "learning_rate": 0.00019386110145319963,
+      "loss": 1.0952,
+      "step": 107
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.6638639569282532,
+      "learning_rate": 0.00019372067050063438,
+      "loss": 1.1125,
+      "step": 108
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.6010698676109314,
+      "learning_rate": 0.000193578703477753,
+      "loss": 1.1715,
+      "step": 109
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.5837023258209229,
+      "learning_rate": 0.00019343520271137763,
+      "loss": 0.8489,
+      "step": 110
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.6870157718658447,
+      "learning_rate": 0.0001932901705534683,
+      "loss": 1.0953,
+      "step": 111
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5713046789169312,
+      "learning_rate": 0.00019314360938108425,
+      "loss": 1.1113,
+      "step": 112
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.5966447591781616,
+      "learning_rate": 0.00019299552159634517,
+      "loss": 1.2646,
+      "step": 113
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.6116918921470642,
+      "learning_rate": 0.00019284590962639176,
+      "loss": 1.0807,
+      "step": 114
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.5885886549949646,
+      "learning_rate": 0.0001926947759233459,
+      "loss": 0.9551,
+      "step": 115
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.5844876766204834,
+      "learning_rate": 0.00019254212296427044,
+      "loss": 1.0009,
+      "step": 116
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.5967299342155457,
+      "learning_rate": 0.0001923879532511287,
+      "loss": 0.863,
+      "step": 117
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.543732762336731,
+      "learning_rate": 0.0001922322693107434,
+      "loss": 0.8331,
+      "step": 118
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.6925728917121887,
+      "learning_rate": 0.0001920750736947553,
+      "loss": 1.1044,
+      "step": 119
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.5720507502555847,
+      "learning_rate": 0.00019191636897958122,
+      "loss": 1.2173,
+      "step": 120
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.6664772033691406,
+      "learning_rate": 0.0001917561577663721,
+      "loss": 0.9849,
+      "step": 121
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.6026978492736816,
+      "learning_rate": 0.00019159444268097012,
+      "loss": 1.2952,
+      "step": 122
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.6648169755935669,
+      "learning_rate": 0.00019143122637386566,
+      "loss": 0.8417,
+      "step": 123
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.7643215656280518,
+      "learning_rate": 0.00019126651152015403,
+      "loss": 1.1142,
+      "step": 124
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.6389123797416687,
+      "learning_rate": 0.00019110030081949156,
+      "loss": 1.2387,
+      "step": 125
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.7826026678085327,
+      "learning_rate": 0.00019093259699605125,
+      "loss": 1.1407,
+      "step": 126
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.6801394820213318,
+      "learning_rate": 0.0001907634027984782,
+      "loss": 0.932,
+      "step": 127
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.6450052857398987,
+      "learning_rate": 0.0001905927209998447,
+      "loss": 1.3197,
+      "step": 128
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.6216878890991211,
+      "learning_rate": 0.00019042055439760444,
+      "loss": 1.2593,
+      "step": 129
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.6000977158546448,
+      "learning_rate": 0.000190246905813547,
+      "loss": 0.9974,
+      "step": 130
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.5806196928024292,
+      "learning_rate": 0.0001900717780937514,
+      "loss": 1.1792,
+      "step": 131
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.6986164450645447,
+      "learning_rate": 0.00018989517410853955,
+      "loss": 1.252,
+      "step": 132
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.6852320432662964,
+      "learning_rate": 0.0001897170967524291,
+      "loss": 1.098,
+      "step": 133
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.6186272501945496,
+      "learning_rate": 0.00018953754894408616,
+      "loss": 1.1099,
+      "step": 134
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.7196840643882751,
+      "learning_rate": 0.0001893565336262773,
+      "loss": 1.1809,
+      "step": 135
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.6523413062095642,
+      "learning_rate": 0.00018917405376582145,
+      "loss": 1.2383,
+      "step": 136
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.7788291573524475,
+      "learning_rate": 0.00018899011235354115,
+      "loss": 1.023,
+      "step": 137
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.5616946220397949,
+      "learning_rate": 0.00018880471240421365,
+      "loss": 0.8242,
+      "step": 138
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.6670994758605957,
+      "learning_rate": 0.00018861785695652142,
+      "loss": 1.2797,
+      "step": 139
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.6285648345947266,
+      "learning_rate": 0.00018842954907300236,
+      "loss": 1.0959,
+      "step": 140
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.6495100855827332,
+      "learning_rate": 0.00018823979183999964,
+      "loss": 1.1426,
+      "step": 141
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.7513198256492615,
+      "learning_rate": 0.00018804858836761107,
+      "loss": 1.2578,
+      "step": 142
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5422288775444031,
+      "learning_rate": 0.0001878559417896382,
+      "loss": 0.9833,
+      "step": 143
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.605277419090271,
+      "learning_rate": 0.0001876618552635348,
+      "loss": 1.2323,
+      "step": 144
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.7177323698997498,
+      "learning_rate": 0.00018746633197035527,
+      "loss": 1.2153,
+      "step": 145
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.5417729020118713,
+      "learning_rate": 0.00018726937511470246,
+      "loss": 0.9367,
+      "step": 146
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.6895157098770142,
+      "learning_rate": 0.00018707098792467515,
+      "loss": 1.3363,
+      "step": 147
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.5565975308418274,
+      "learning_rate": 0.00018687117365181512,
+      "loss": 1.0385,
+      "step": 148
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.7168130278587341,
+      "learning_rate": 0.00018666993557105377,
+      "loss": 1.2281,
+      "step": 149
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.839598536491394,
+      "learning_rate": 0.00018646727698065865,
+      "loss": 1.4159,
+      "step": 150
+    },
+    {
+      "epoch": 0.75,
+      "eval_loss": 1.119249939918518,
+      "eval_runtime": 2.9417,
+      "eval_samples_per_second": 33.994,
+      "eval_steps_per_second": 16.997,
+      "step": 150
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.5981218814849854,
+      "learning_rate": 0.00018626320120217923,
+      "loss": 1.0671,
+      "step": 151
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.6944805383682251,
+      "learning_rate": 0.00018605771158039253,
+      "loss": 1.3229,
+      "step": 152
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.6238952875137329,
+      "learning_rate": 0.00018585081148324832,
+      "loss": 1.1578,
+      "step": 153
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.6363958120346069,
+      "learning_rate": 0.00018564250430181387,
+      "loss": 1.3265,
+      "step": 154
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.5761409401893616,
+      "learning_rate": 0.00018543279345021834,
+      "loss": 1.1844,
+      "step": 155
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.810093104839325,
+      "learning_rate": 0.00018522168236559695,
+      "loss": 1.2033,
+      "step": 156
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.7487497329711914,
+      "learning_rate": 0.0001850091745080345,
+      "loss": 1.1043,
+      "step": 157
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.6162795424461365,
+      "learning_rate": 0.00018479527336050878,
+      "loss": 1.2486,
+      "step": 158
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5720970034599304,
+      "learning_rate": 0.00018457998242883344,
+      "loss": 1.0381,
+      "step": 159
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.6686292886734009,
+      "learning_rate": 0.00018436330524160047,
+      "loss": 1.502,
+      "step": 160
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.5931655764579773,
+      "learning_rate": 0.00018414524535012244,
+      "loss": 1.0813,
+      "step": 161
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.6548634171485901,
+      "learning_rate": 0.00018392580632837423,
+      "loss": 1.3147,
+      "step": 162
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.559681236743927,
+      "learning_rate": 0.00018370499177293464,
+      "loss": 1.1096,
+      "step": 163
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.6365666389465332,
+      "learning_rate": 0.00018348280530292713,
+      "loss": 1.2062,
+      "step": 164
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.616242527961731,
+      "learning_rate": 0.00018325925055996076,
+      "loss": 1.1219,
+      "step": 165
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.6588903069496155,
+      "learning_rate": 0.0001830343312080704,
+      "loss": 1.2697,
+      "step": 166
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.5880855321884155,
+      "learning_rate": 0.00018280805093365672,
+      "loss": 1.1511,
+      "step": 167
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.7549880743026733,
+      "learning_rate": 0.00018258041344542566,
+      "loss": 1.2181,
+      "step": 168
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.6862443089485168,
+      "learning_rate": 0.00018235142247432782,
+      "loss": 1.8496,
+      "step": 169
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.5903118848800659,
+      "learning_rate": 0.0001821210817734972,
+      "loss": 1.2092,
+      "step": 170
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.6936279535293579,
+      "learning_rate": 0.00018188939511818965,
+      "loss": 1.0635,
+      "step": 171
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.6887457370758057,
+      "learning_rate": 0.0001816563663057211,
+      "loss": 0.9387,
+      "step": 172
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.6930254101753235,
+      "learning_rate": 0.00018142199915540527,
+      "loss": 1.1651,
+      "step": 173
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.6529977321624756,
+      "learning_rate": 0.00018118629750849105,
+      "loss": 1.2512,
+      "step": 174
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.705954372882843,
+      "learning_rate": 0.0001809492652280996,
+      "loss": 1.2601,
+      "step": 175
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.6263706088066101,
+      "learning_rate": 0.00018071090619916093,
+      "loss": 1.0446,
+      "step": 176
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.7754440307617188,
+      "learning_rate": 0.00018047122432835038,
+      "loss": 1.2517,
+      "step": 177
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.6904909610748291,
+      "learning_rate": 0.0001802302235440245,
+      "loss": 1.3028,
+      "step": 178
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.6373815536499023,
+      "learning_rate": 0.0001799879077961566,
+      "loss": 0.7538,
+      "step": 179
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.6192349791526794,
+      "learning_rate": 0.00017974428105627208,
+      "loss": 1.1583,
+      "step": 180
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.6500440239906311,
+      "learning_rate": 0.00017949934731738347,
+      "loss": 1.189,
+      "step": 181
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.5701293349266052,
+      "learning_rate": 0.0001792531105939247,
+      "loss": 0.9937,
+      "step": 182
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.6383854150772095,
+      "learning_rate": 0.0001790055749216856,
+      "loss": 1.0381,
+      "step": 183
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.7212352156639099,
+      "learning_rate": 0.00017875674435774547,
+      "loss": 1.2023,
+      "step": 184
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.7195665836334229,
+      "learning_rate": 0.00017850662298040678,
+      "loss": 1.4138,
+      "step": 185
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.6174137592315674,
+      "learning_rate": 0.0001782552148891283,
+      "loss": 0.8007,
+      "step": 186
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.672179102897644,
+      "learning_rate": 0.00017800252420445788,
+      "loss": 1.1403,
+      "step": 187
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.6487817168235779,
+      "learning_rate": 0.00017774855506796496,
+      "loss": 1.169,
+      "step": 188
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.7027740478515625,
+      "learning_rate": 0.0001774933116421725,
+      "loss": 1.2268,
+      "step": 189
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.7178415060043335,
+      "learning_rate": 0.00017723679811048904,
+      "loss": 1.2785,
+      "step": 190
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.682354748249054,
+      "learning_rate": 0.00017697901867713995,
+      "loss": 1.2195,
+      "step": 191
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.7199010252952576,
+      "learning_rate": 0.00017671997756709863,
+      "loss": 1.4132,
+      "step": 192
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.7743118405342102,
+      "learning_rate": 0.0001764596790260171,
+      "loss": 0.9824,
+      "step": 193
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.7540227174758911,
+      "learning_rate": 0.00017619812732015664,
+      "loss": 1.0527,
+      "step": 194
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.6113067269325256,
+      "learning_rate": 0.00017593532673631766,
+      "loss": 1.2446,
+      "step": 195
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.6951828598976135,
+      "learning_rate": 0.00017567128158176953,
+      "loss": 1.3333,
+      "step": 196
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.570866584777832,
+      "learning_rate": 0.00017540599618418007,
+      "loss": 1.0012,
+      "step": 197
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.5432811379432678,
+      "learning_rate": 0.00017513947489154443,
+      "loss": 1.1343,
+      "step": 198
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.6711558103561401,
+      "learning_rate": 0.00017487172207211396,
+      "loss": 1.0945,
+      "step": 199
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.675626814365387,
+      "learning_rate": 0.0001746027421143246,
+      "loss": 1.2807,
+      "step": 200
+    },
+    {
+      "epoch": 1.01,
+      "eval_loss": 1.1153115034103394,
+      "eval_runtime": 3.0007,
+      "eval_samples_per_second": 33.326,
+      "eval_steps_per_second": 16.663,
+      "step": 200
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.6204088926315308,
+      "learning_rate": 0.00017433253942672496,
+      "loss": 1.2167,
+      "step": 201
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.6080848574638367,
+      "learning_rate": 0.000174061118437904,
+      "loss": 0.979,
+      "step": 202
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.8325397372245789,
+      "learning_rate": 0.00017378848359641847,
+      "loss": 0.9095,
+      "step": 203
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.6108893752098083,
+      "learning_rate": 0.00017351463937072004,
+      "loss": 1.0784,
+      "step": 204
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.6140009164810181,
+      "learning_rate": 0.00017323959024908209,
+      "loss": 1.131,
+      "step": 205
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.7503536343574524,
+      "learning_rate": 0.00017296334073952605,
+      "loss": 1.0152,
+      "step": 206
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.6903036236763,
+      "learning_rate": 0.0001726858953697475,
+      "loss": 1.1751,
+      "step": 207
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.6842136979103088,
+      "learning_rate": 0.00017240725868704218,
+      "loss": 0.9362,
+      "step": 208
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.6317443251609802,
+      "learning_rate": 0.00017212743525823112,
+      "loss": 1.0199,
+      "step": 209
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.6331597566604614,
+      "learning_rate": 0.0001718464296695861,
+      "loss": 0.8634,
+      "step": 210
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.7953663468360901,
+      "learning_rate": 0.0001715642465267543,
+      "loss": 1.0635,
+      "step": 211
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.6130046248435974,
+      "learning_rate": 0.00017128089045468294,
+      "loss": 0.8426,
+      "step": 212
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.5984789729118347,
+      "learning_rate": 0.00017099636609754329,
+      "loss": 0.7435,
+      "step": 213
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.8032707571983337,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.9271,
+      "step": 214
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.78273606300354,
+      "learning_rate": 0.00017042383120040834,
+      "loss": 0.8695,
+      "step": 215
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.7779294848442078,
+      "learning_rate": 0.00017013583004418993,
+      "loss": 1.085,
+      "step": 216
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.7201984524726868,
+      "learning_rate": 0.00016984667937030318,
+      "loss": 0.8079,
+      "step": 217
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.6246169805526733,
+      "learning_rate": 0.00016955638391789228,
+      "loss": 0.7941,
+      "step": 218
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.7627923488616943,
+      "learning_rate": 0.00016926494844486412,
+      "loss": 0.9281,
+      "step": 219
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.6979169249534607,
+      "learning_rate": 0.00016897237772781044,
+      "loss": 0.8461,
+      "step": 220
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.7872811555862427,
+      "learning_rate": 0.00016867867656192946,
+      "loss": 0.9413,
+      "step": 221
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.7482172846794128,
+      "learning_rate": 0.00016838384976094738,
+      "loss": 0.9107,
+      "step": 222
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.8587368130683899,
+      "learning_rate": 0.00016808790215703935,
+      "loss": 0.9886,
+      "step": 223
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.732606828212738,
+      "learning_rate": 0.00016779083860075033,
+      "loss": 0.6831,
+      "step": 224
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.9272279143333435,
+      "learning_rate": 0.0001674926639609157,
+      "loss": 1.1396,
+      "step": 225
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.6473172307014465,
+      "learning_rate": 0.00016719338312458124,
+      "loss": 0.8299,
+      "step": 226
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.8427954316139221,
+      "learning_rate": 0.00016689300099692332,
+      "loss": 0.9203,
+      "step": 227
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.8205825090408325,
+      "learning_rate": 0.00016659152250116812,
+      "loss": 0.8532,
+      "step": 228
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.7522780299186707,
+      "learning_rate": 0.00016628895257851135,
+      "loss": 0.7687,
+      "step": 229
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.8582683205604553,
+      "learning_rate": 0.000165985296188037,
+      "loss": 0.9217,
+      "step": 230
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.8408709168434143,
+      "learning_rate": 0.0001656805583066361,
+      "loss": 1.0371,
+      "step": 231
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.9851942658424377,
+      "learning_rate": 0.00016537474392892528,
+      "loss": 1.044,
+      "step": 232
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.8463842868804932,
+      "learning_rate": 0.00016506785806716465,
+      "loss": 0.9521,
+      "step": 233
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.825255811214447,
+      "learning_rate": 0.00016475990575117605,
+      "loss": 0.8524,
+      "step": 234
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.1519947052001953,
+      "learning_rate": 0.0001644508920282601,
+      "loss": 0.9906,
+      "step": 235
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.8102966547012329,
+      "learning_rate": 0.000164140821963114,
+      "loss": 0.9192,
+      "step": 236
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.0159798860549927,
+      "learning_rate": 0.0001638297006377481,
+      "loss": 1.0234,
+      "step": 237
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.0157923698425293,
+      "learning_rate": 0.00016351753315140287,
+      "loss": 0.8921,
+      "step": 238
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.8466264009475708,
+      "learning_rate": 0.00016320432462046516,
+      "loss": 0.7098,
+      "step": 239
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.8298121690750122,
+      "learning_rate": 0.00016289008017838445,
+      "loss": 0.8517,
+      "step": 240
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.2163349390029907,
+      "learning_rate": 0.00016257480497558873,
+      "loss": 1.1172,
+      "step": 241
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.9839556217193604,
+      "learning_rate": 0.0001622585041793999,
+      "loss": 1.1022,
+      "step": 242
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.7986888289451599,
+      "learning_rate": 0.00016194118297394936,
+      "loss": 0.7826,
+      "step": 243
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.9318971037864685,
+      "learning_rate": 0.00016162284656009274,
+      "loss": 0.8899,
+      "step": 244
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.0234252214431763,
+      "learning_rate": 0.00016130350015532496,
+      "loss": 0.8831,
+      "step": 245
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.8264230489730835,
+      "learning_rate": 0.00016098314899369446,
+      "loss": 1.1389,
+      "step": 246
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.8845193982124329,
+      "learning_rate": 0.0001606617983257176,
+      "loss": 1.0822,
+      "step": 247
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.9044338464736938,
+      "learning_rate": 0.00016033945341829248,
+      "loss": 1.0556,
+      "step": 248
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.9660309553146362,
+      "learning_rate": 0.00016001611955461265,
+      "loss": 1.0331,
+      "step": 249
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.0728851556777954,
+      "learning_rate": 0.0001596918020340805,
+      "loss": 1.0465,
+      "step": 250
+    },
+    {
+      "epoch": 1.24,
+      "eval_loss": 1.1568788290023804,
+      "eval_runtime": 2.9063,
+      "eval_samples_per_second": 34.408,
+      "eval_steps_per_second": 17.204,
+      "step": 250
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.9447798728942871,
+      "learning_rate": 0.00015936650617222063,
+      "loss": 0.9487,
+      "step": 251
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.0429767370224,
+      "learning_rate": 0.00015904023730059228,
+      "loss": 1.006,
+      "step": 252
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.9871753454208374,
+      "learning_rate": 0.00015871300076670234,
+      "loss": 0.9494,
+      "step": 253
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.7644299268722534,
+      "learning_rate": 0.00015838480193391754,
+      "loss": 0.6077,
+      "step": 254
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.1654846668243408,
+      "learning_rate": 0.0001580556461813766,
+      "loss": 1.0632,
+      "step": 255
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.0508393049240112,
+      "learning_rate": 0.00015772553890390197,
+      "loss": 0.8754,
+      "step": 256
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.8676743507385254,
+      "learning_rate": 0.0001573944855119115,
+      "loss": 1.007,
+      "step": 257
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.178464412689209,
+      "learning_rate": 0.00015706249143132982,
+      "loss": 1.041,
+      "step": 258
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.0226370096206665,
+      "learning_rate": 0.00015672956210349923,
+      "loss": 1.1114,
+      "step": 259
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 0.9840787649154663,
+      "learning_rate": 0.00015639570298509064,
+      "loss": 0.9043,
+      "step": 260
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.0564519166946411,
+      "learning_rate": 0.0001560609195480142,
+      "loss": 0.9696,
+      "step": 261
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.9174713492393494,
+      "learning_rate": 0.00015572521727932935,
+      "loss": 0.9849,
+      "step": 262
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.7333153486251831,
+      "learning_rate": 0.00015538860168115527,
+      "loss": 0.7286,
+      "step": 263
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 0.9282216429710388,
+      "learning_rate": 0.00015505107827058036,
+      "loss": 0.8975,
+      "step": 264
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.003192663192749,
+      "learning_rate": 0.00015471265257957202,
+      "loss": 1.1836,
+      "step": 265
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.8726491928100586,
+      "learning_rate": 0.00015437333015488587,
+      "loss": 0.9313,
+      "step": 266
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.9721888899803162,
+      "learning_rate": 0.00015403311655797492,
+      "loss": 0.8935,
+      "step": 267
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.0440247058868408,
+      "learning_rate": 0.0001536920173648984,
+      "loss": 0.9741,
+      "step": 268
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.9814698100090027,
+      "learning_rate": 0.00015335003816623028,
+      "loss": 0.8982,
+      "step": 269
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.904926598072052,
+      "learning_rate": 0.00015300718456696778,
+      "loss": 0.8579,
+      "step": 270
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.0483490228652954,
+      "learning_rate": 0.00015266346218643947,
+      "loss": 0.8108,
+      "step": 271
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 0.9156501293182373,
+      "learning_rate": 0.000152318876658213,
+      "loss": 0.9442,
+      "step": 272
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 0.9268532395362854,
+      "learning_rate": 0.00015197343363000307,
+      "loss": 0.8243,
+      "step": 273
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.8675321340560913,
+      "learning_rate": 0.00015162713876357858,
+      "loss": 0.7758,
+      "step": 274
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.9131675362586975,
+      "learning_rate": 0.00015127999773467002,
+      "loss": 0.8845,
+      "step": 275
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.0260008573532104,
+      "learning_rate": 0.00015093201623287631,
+      "loss": 0.9032,
+      "step": 276
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.044528841972351,
+      "learning_rate": 0.00015058319996157172,
+      "loss": 1.0489,
+      "step": 277
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.9959388375282288,
+      "learning_rate": 0.0001502335546378122,
+      "loss": 0.858,
+      "step": 278
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.8414021730422974,
+      "learning_rate": 0.00014988308599224183,
+      "loss": 0.782,
+      "step": 279
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.9205671548843384,
+      "learning_rate": 0.00014953179976899878,
+      "loss": 0.8376,
+      "step": 280
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.9481040239334106,
+      "learning_rate": 0.0001491797017256212,
+      "loss": 0.851,
+      "step": 281
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.8266577124595642,
+      "learning_rate": 0.00014882679763295306,
+      "loss": 0.7228,
+      "step": 282
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.0222742557525635,
+      "learning_rate": 0.0001484730932750491,
+      "loss": 0.7955,
+      "step": 283
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.0014468431472778,
+      "learning_rate": 0.00014811859444908052,
+      "loss": 0.9107,
+      "step": 284
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 0.9157910346984863,
+      "learning_rate": 0.00014776330696523963,
+      "loss": 1.0208,
+      "step": 285
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.0565227270126343,
+      "learning_rate": 0.00014740723664664483,
+      "loss": 0.6496,
+      "step": 286
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.0323175191879272,
+      "learning_rate": 0.00014705038932924503,
+      "loss": 1.0043,
+      "step": 287
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.0063213109970093,
+      "learning_rate": 0.00014669277086172406,
+      "loss": 1.1286,
+      "step": 288
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 0.8602890968322754,
+      "learning_rate": 0.00014633438710540489,
+      "loss": 0.7254,
+      "step": 289
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.9782769083976746,
+      "learning_rate": 0.00014597524393415335,
+      "loss": 0.7086,
+      "step": 290
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.9836515784263611,
+      "learning_rate": 0.00014561534723428205,
+      "loss": 0.8405,
+      "step": 291
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.0674123764038086,
+      "learning_rate": 0.00014525470290445392,
+      "loss": 1.0317,
+      "step": 292
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.9632031917572021,
+      "learning_rate": 0.00014489331685558525,
+      "loss": 0.9473,
+      "step": 293
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.0105828046798706,
+      "learning_rate": 0.00014453119501074924,
+      "loss": 0.8199,
+      "step": 294
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.0012938976287842,
+      "learning_rate": 0.00014416834330507856,
+      "loss": 0.9099,
+      "step": 295
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.0367400646209717,
+      "learning_rate": 0.00014380476768566824,
+      "loss": 1.0958,
+      "step": 296
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.7329337000846863,
+      "learning_rate": 0.00014344047411147818,
+      "loss": 0.6189,
+      "step": 297
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.9014643430709839,
+      "learning_rate": 0.00014307546855323549,
+      "loss": 0.8168,
+      "step": 298
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.7568360567092896,
+      "learning_rate": 0.00014270975699333654,
+      "loss": 0.7857,
+      "step": 299
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.9573479890823364,
+      "learning_rate": 0.00014234334542574906,
+      "loss": 0.9577,
+      "step": 300
+    },
+    {
+      "epoch": 1.49,
+      "eval_loss": 1.149274230003357,
+      "eval_runtime": 2.9222,
+      "eval_samples_per_second": 34.22,
+      "eval_steps_per_second": 17.11,
+      "step": 300
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.8575048446655273,
+      "learning_rate": 0.00014197623985591373,
+      "loss": 0.8521,
+      "step": 301
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.990980863571167,
+      "learning_rate": 0.00014160844630064595,
+      "loss": 1.0642,
+      "step": 302
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.1145374774932861,
+      "learning_rate": 0.00014123997078803707,
+      "loss": 0.8963,
+      "step": 303
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 0.9661507606506348,
+      "learning_rate": 0.00014087081935735564,
+      "loss": 0.9473,
+      "step": 304
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.019618272781372,
+      "learning_rate": 0.00014050099805894837,
+      "loss": 0.9048,
+      "step": 305
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.871661365032196,
+      "learning_rate": 0.00014013051295414108,
+      "loss": 0.6644,
+      "step": 306
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.9834782481193542,
+      "learning_rate": 0.00013975937011513932,
+      "loss": 0.9226,
+      "step": 307
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 0.9938518404960632,
+      "learning_rate": 0.00013938757562492873,
+      "loss": 0.9608,
+      "step": 308
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.0692541599273682,
+      "learning_rate": 0.00013901513557717553,
+      "loss": 0.9646,
+      "step": 309
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 1.039904236793518,
+      "learning_rate": 0.00013864205607612648,
+      "loss": 0.7799,
+      "step": 310
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.9138852953910828,
+      "learning_rate": 0.000138268343236509,
+      "loss": 0.8297,
+      "step": 311
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.01775324344635,
+      "learning_rate": 0.00013789400318343068,
+      "loss": 0.8992,
+      "step": 312
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.0052934885025024,
+      "learning_rate": 0.0001375190420522792,
+      "loss": 0.8212,
+      "step": 313
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.0567269325256348,
+      "learning_rate": 0.00013714346598862166,
+      "loss": 1.0402,
+      "step": 314
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.8707680702209473,
+      "learning_rate": 0.00013676728114810367,
+      "loss": 0.8864,
+      "step": 315
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 0.959578812122345,
+      "learning_rate": 0.00013639049369634876,
+      "loss": 0.7048,
+      "step": 316
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.0675721168518066,
+      "learning_rate": 0.00013601310980885714,
+      "loss": 1.0025,
+      "step": 317
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.8831722736358643,
+      "learning_rate": 0.0001356351356709045,
+      "loss": 0.8058,
+      "step": 318
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 1.0400885343551636,
+      "learning_rate": 0.00013525657747744072,
+      "loss": 1.0273,
+      "step": 319
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.0046364068984985,
+      "learning_rate": 0.00013487744143298822,
+      "loss": 0.8441,
+      "step": 320
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.029714822769165,
+      "learning_rate": 0.0001344977337515404,
+      "loss": 0.7771,
+      "step": 321
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.8168841004371643,
+      "learning_rate": 0.0001341174606564596,
+      "loss": 0.8024,
+      "step": 322
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.9833108186721802,
+      "learning_rate": 0.00013373662838037537,
+      "loss": 0.9065,
+      "step": 323
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 0.9366996884346008,
+      "learning_rate": 0.00013335524316508208,
+      "loss": 0.9436,
+      "step": 324
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 0.8757138848304749,
+      "learning_rate": 0.00013297331126143667,
+      "loss": 0.8399,
+      "step": 325
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.1467972993850708,
+      "learning_rate": 0.00013259083892925633,
+      "loss": 1.1416,
+      "step": 326
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.9916189312934875,
+      "learning_rate": 0.00013220783243721572,
+      "loss": 0.9531,
+      "step": 327
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 0.9911974668502808,
+      "learning_rate": 0.0001318242980627444,
+      "loss": 0.9476,
+      "step": 328
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.0219913721084595,
+      "learning_rate": 0.0001314402420919238,
+      "loss": 0.9288,
+      "step": 329
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.0889464616775513,
+      "learning_rate": 0.00013105567081938424,
+      "loss": 0.8025,
+      "step": 330
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 0.8797928690910339,
+      "learning_rate": 0.00013067059054820183,
+      "loss": 0.9002,
+      "step": 331
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.0043346881866455,
+      "learning_rate": 0.00013028500758979506,
+      "loss": 0.8971,
+      "step": 332
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 0.9221352934837341,
+      "learning_rate": 0.00012989892826382145,
+      "loss": 0.8181,
+      "step": 333
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.2053778171539307,
+      "learning_rate": 0.00012951235889807386,
+      "loss": 0.9374,
+      "step": 334
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.2230528593063354,
+      "learning_rate": 0.00012912530582837682,
+      "loss": 0.9123,
+      "step": 335
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.8403642773628235,
+      "learning_rate": 0.00012873777539848283,
+      "loss": 0.9323,
+      "step": 336
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.1632657051086426,
+      "learning_rate": 0.00012834977395996818,
+      "loss": 1.1916,
+      "step": 337
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 0.9937611222267151,
+      "learning_rate": 0.0001279613078721289,
+      "loss": 1.141,
+      "step": 338
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 0.8973978161811829,
+      "learning_rate": 0.0001275723835018767,
+      "loss": 0.8399,
+      "step": 339
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 1.0466402769088745,
+      "learning_rate": 0.0001271830072236343,
+      "loss": 0.8127,
+      "step": 340
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 0.9691051244735718,
+      "learning_rate": 0.0001267931854192313,
+      "loss": 0.9794,
+      "step": 341
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.925682544708252,
+      "learning_rate": 0.0001264029244777993,
+      "loss": 0.8233,
+      "step": 342
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.9783278703689575,
+      "learning_rate": 0.00012601223079566743,
+      "loss": 0.9542,
+      "step": 343
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 0.9945007562637329,
+      "learning_rate": 0.00012562111077625722,
+      "loss": 1.0757,
+      "step": 344
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.1597148180007935,
+      "learning_rate": 0.000125229570829978,
+      "loss": 1.1052,
+      "step": 345
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.7987023591995239,
+      "learning_rate": 0.0001248376173741215,
+      "loss": 0.8602,
+      "step": 346
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.8969370126724243,
+      "learning_rate": 0.00012444525683275688,
+      "loss": 1.6019,
+      "step": 347
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.0622583627700806,
+      "learning_rate": 0.00012405249563662537,
+      "loss": 1.0735,
+      "step": 348
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.0987950563430786,
+      "learning_rate": 0.00012365934022303491,
+      "loss": 0.9973,
+      "step": 349
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.9930221438407898,
+      "learning_rate": 0.00012326579703575462,
+      "loss": 1.1257,
+      "step": 350
+    },
+    {
+      "epoch": 1.74,
+      "eval_loss": 1.1461950540542603,
+      "eval_runtime": 2.9343,
+      "eval_samples_per_second": 34.08,
+      "eval_steps_per_second": 17.04,
+      "step": 350
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 1.0799540281295776,
+      "learning_rate": 0.00012287187252490913,
+      "loss": 0.8758,
+      "step": 351
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.0633143186569214,
+      "learning_rate": 0.00012247757314687297,
+      "loss": 1.0396,
+      "step": 352
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.9504884481430054,
+      "learning_rate": 0.00012208290536416463,
+      "loss": 0.8192,
+      "step": 353
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.8587303161621094,
+      "learning_rate": 0.00012168787564534078,
+      "loss": 0.748,
+      "step": 354
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.3652898073196411,
+      "learning_rate": 0.0001212924904648902,
+      "loss": 1.0768,
+      "step": 355
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.0679266452789307,
+      "learning_rate": 0.00012089675630312754,
+      "loss": 0.9099,
+      "step": 356
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.2426522970199585,
+      "learning_rate": 0.00012050067964608724,
+      "loss": 0.9869,
+      "step": 357
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.9639490246772766,
+      "learning_rate": 0.00012010426698541728,
+      "loss": 0.6993,
+      "step": 358
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 1.1884175539016724,
+      "learning_rate": 0.0001197075248182726,
+      "loss": 0.9868,
+      "step": 359
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.9860052466392517,
+      "learning_rate": 0.00011931045964720881,
+      "loss": 0.7148,
+      "step": 360
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.8812693357467651,
+      "learning_rate": 0.00011891307798007536,
+      "loss": 0.9295,
+      "step": 361
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.032242774963379,
+      "learning_rate": 0.00011851538632990921,
+      "loss": 1.2292,
+      "step": 362
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.9777809381484985,
+      "learning_rate": 0.00011811739121482777,
+      "loss": 1.0646,
+      "step": 363
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.0464228391647339,
+      "learning_rate": 0.0001177190991579223,
+      "loss": 0.9703,
+      "step": 364
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 0.9763212203979492,
+      "learning_rate": 0.00011732051668715081,
+      "loss": 0.7753,
+      "step": 365
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.114912748336792,
+      "learning_rate": 0.00011692165033523117,
+      "loss": 0.9979,
+      "step": 366
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 0.8752657771110535,
+      "learning_rate": 0.00011652250663953415,
+      "loss": 0.9964,
+      "step": 367
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 0.9158682823181152,
+      "learning_rate": 0.00011612309214197599,
+      "loss": 0.7576,
+      "step": 368
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 0.8457457423210144,
+      "learning_rate": 0.00011572341338891144,
+      "loss": 0.9144,
+      "step": 369
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 1.0021049976348877,
+      "learning_rate": 0.00011532347693102632,
+      "loss": 0.9226,
+      "step": 370
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 0.9614117741584778,
+      "learning_rate": 0.00011492328932323022,
+      "loss": 1.0214,
+      "step": 371
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 0.9289172291755676,
+      "learning_rate": 0.00011452285712454904,
+      "loss": 0.8793,
+      "step": 372
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.0654929876327515,
+      "learning_rate": 0.00011412218689801748,
+      "loss": 1.1519,
+      "step": 373
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.0563515424728394,
+      "learning_rate": 0.00011372128521057155,
+      "loss": 0.9859,
+      "step": 374
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.011228322982788,
+      "learning_rate": 0.00011332015863294076,
+      "loss": 0.9138,
+      "step": 375
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 0.942287802696228,
+      "learning_rate": 0.00011291881373954065,
+      "loss": 0.8865,
+      "step": 376
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 0.9734610319137573,
+      "learning_rate": 0.00011251725710836489,
+      "loss": 0.8578,
+      "step": 377
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.184990406036377,
+      "learning_rate": 0.00011211549532087749,
+      "loss": 1.0107,
+      "step": 378
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.033831238746643,
+      "learning_rate": 0.00011171353496190498,
+      "loss": 1.0496,
+      "step": 379
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 1.018054485321045,
+      "learning_rate": 0.00011131138261952845,
+      "loss": 0.8782,
+      "step": 380
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 0.9694205522537231,
+      "learning_rate": 0.00011090904488497549,
+      "loss": 0.9928,
+      "step": 381
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.9095280170440674,
+      "learning_rate": 0.0001105065283525124,
+      "loss": 0.9821,
+      "step": 382
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.8029172420501709,
+      "learning_rate": 0.00011010383961933581,
+      "loss": 0.6811,
+      "step": 383
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 0.9388089776039124,
+      "learning_rate": 0.00010970098528546481,
+      "loss": 0.9703,
+      "step": 384
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 0.8639506697654724,
+      "learning_rate": 0.00010929797195363259,
+      "loss": 0.8579,
+      "step": 385
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.001845121383667,
+      "learning_rate": 0.0001088948062291783,
+      "loss": 1.038,
+      "step": 386
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.9668776392936707,
+      "learning_rate": 0.00010849149471993882,
+      "loss": 0.9457,
+      "step": 387
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 0.8607358932495117,
+      "learning_rate": 0.00010808804403614043,
+      "loss": 0.8795,
+      "step": 388
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.0189685821533203,
+      "learning_rate": 0.00010768446079029044,
+      "loss": 0.9203,
+      "step": 389
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.9952776432037354,
+      "learning_rate": 0.0001072807515970688,
+      "loss": 1.0368,
+      "step": 390
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.057427167892456,
+      "learning_rate": 0.00010687692307321984,
+      "loss": 1.0568,
+      "step": 391
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.822589099407196,
+      "learning_rate": 0.00010647298183744359,
+      "loss": 0.9598,
+      "step": 392
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.9903733730316162,
+      "learning_rate": 0.00010606893451028743,
+      "loss": 1.0595,
+      "step": 393
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.0125857591629028,
+      "learning_rate": 0.00010566478771403763,
+      "loss": 0.9646,
+      "step": 394
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.899347722530365,
+      "learning_rate": 0.00010526054807261067,
+      "loss": 1.0054,
+      "step": 395
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 1.0629827976226807,
+      "learning_rate": 0.00010485622221144484,
+      "loss": 0.9319,
+      "step": 396
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 0.9910023212432861,
+      "learning_rate": 0.00010445181675739144,
+      "loss": 0.9388,
+      "step": 397
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.8644474744796753,
+      "learning_rate": 0.00010404733833860639,
+      "loss": 0.8007,
+      "step": 398
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.9804188013076782,
+      "learning_rate": 0.00010364279358444144,
+      "loss": 0.9715,
+      "step": 399
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 0.9533838033676147,
+      "learning_rate": 0.00010323818912533561,
+      "loss": 0.9404,
+      "step": 400
+    },
+    {
+      "epoch": 1.99,
+      "eval_loss": 1.1519674062728882,
+      "eval_runtime": 2.9242,
+      "eval_samples_per_second": 34.198,
+      "eval_steps_per_second": 17.099,
+      "step": 400
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 0.9107962250709534,
+      "learning_rate": 0.00010283353159270643,
+      "loss": 0.9431,
+      "step": 401
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.091841220855713,
+      "learning_rate": 0.00010242882761884131,
+      "loss": 0.9116,
+      "step": 402
+    },
+    {
+      "epoch": 2.01,
+      "grad_norm": 1.0149590969085693,
+      "learning_rate": 0.00010202408383678888,
+      "loss": 0.9091,
+      "step": 403
+    },
+    {
+      "epoch": 2.01,
+      "grad_norm": 1.1325241327285767,
+      "learning_rate": 0.00010161930688025017,
+      "loss": 1.0873,
+      "step": 404
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 1.0526857376098633,
+      "learning_rate": 0.0001012145033834699,
+      "loss": 0.9299,
+      "step": 405
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 1.074191689491272,
+      "learning_rate": 0.00010080967998112787,
+      "loss": 1.1391,
+      "step": 406
+    },
+    {
+      "epoch": 2.03,
+      "grad_norm": 1.1719715595245361,
+      "learning_rate": 0.00010040484330823006,
+      "loss": 0.7161,
+      "step": 407
+    },
+    {
+      "epoch": 2.01,
+      "grad_norm": 0.9399845004081726,
+      "learning_rate": 0.0001,
+      "loss": 0.7587,
+      "step": 408
+    },
+    {
+      "epoch": 2.01,
+      "grad_norm": 0.8841493725776672,
+      "learning_rate": 9.959515669176996e-05,
+      "loss": 0.6094,
+      "step": 409
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 0.8972917199134827,
+      "learning_rate": 9.919032001887215e-05,
+      "loss": 0.823,
+      "step": 410
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 1.1433557271957397,
+      "learning_rate": 9.878549661653012e-05,
+      "loss": 0.8466,
+      "step": 411
+    },
+    {
+      "epoch": 2.03,
+      "grad_norm": 1.080410361289978,
+      "learning_rate": 9.838069311974986e-05,
+      "loss": 0.6281,
+      "step": 412
+    },
+    {
+      "epoch": 2.03,
+      "grad_norm": 0.9081548452377319,
+      "learning_rate": 9.797591616321114e-05,
+      "loss": 0.7148,
+      "step": 413
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 1.0240696668624878,
+      "learning_rate": 9.757117238115871e-05,
+      "loss": 0.5947,
+      "step": 414
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 1.038631796836853,
+      "learning_rate": 9.716646840729361e-05,
+      "loss": 0.5712,
+      "step": 415
+    },
+    {
+      "epoch": 2.05,
+      "grad_norm": 1.0166879892349243,
+      "learning_rate": 9.676181087466444e-05,
+      "loss": 0.5389,
+      "step": 416
+    },
+    {
+      "epoch": 2.05,
+      "grad_norm": 1.2785813808441162,
+      "learning_rate": 9.635720641555858e-05,
+      "loss": 0.6171,
+      "step": 417
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 1.2383880615234375,
+      "learning_rate": 9.595266166139366e-05,
+      "loss": 0.5927,
+      "step": 418
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 1.4389182329177856,
+      "learning_rate": 9.554818324260859e-05,
+      "loss": 0.6568,
+      "step": 419
+    },
+    {
+      "epoch": 2.07,
+      "grad_norm": 1.3202635049819946,
+      "learning_rate": 9.514377778855521e-05,
+      "loss": 0.7599,
+      "step": 420
+    },
+    {
+      "epoch": 2.07,
+      "grad_norm": 1.3828835487365723,
+      "learning_rate": 9.473945192738933e-05,
+      "loss": 0.5168,
+      "step": 421
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 0.9763804078102112,
+      "learning_rate": 9.433521228596237e-05,
+      "loss": 0.4694,
+      "step": 422
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 1.2587525844573975,
+      "learning_rate": 9.393106548971256e-05,
+      "loss": 0.7283,
+      "step": 423
+    },
+    {
+      "epoch": 2.09,
+      "grad_norm": 0.9997501969337463,
+      "learning_rate": 9.352701816255643e-05,
+      "loss": 0.5682,
+      "step": 424
+    },
+    {
+      "epoch": 2.09,
+      "grad_norm": 1.4405382871627808,
+      "learning_rate": 9.312307692678017e-05,
+      "loss": 1.1634,
+      "step": 425
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 1.2368428707122803,
+      "learning_rate": 9.27192484029312e-05,
+      "loss": 0.6116,
+      "step": 426
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 0.9565535187721252,
+      "learning_rate": 9.231553920970958e-05,
+      "loss": 0.455,
+      "step": 427
+    },
+    {
+      "epoch": 2.11,
+      "grad_norm": 1.2496212720870972,
+      "learning_rate": 9.19119559638596e-05,
+      "loss": 0.706,
+      "step": 428
+    },
+    {
+      "epoch": 2.11,
+      "grad_norm": 1.08584725856781,
+      "learning_rate": 9.150850528006119e-05,
+      "loss": 0.6962,
+      "step": 429
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 1.161963939666748,
+      "learning_rate": 9.110519377082172e-05,
+      "loss": 0.5892,
+      "step": 430
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 1.1196906566619873,
+      "learning_rate": 9.070202804636745e-05,
+      "loss": 0.7261,
+      "step": 431
+    },
+    {
+      "epoch": 2.13,
+      "grad_norm": 1.372056007385254,
+      "learning_rate": 9.02990147145352e-05,
+      "loss": 0.7534,
+      "step": 432
+    },
+    {
+      "epoch": 2.13,
+      "grad_norm": 1.2965703010559082,
+      "learning_rate": 8.98961603806642e-05,
+      "loss": 0.5013,
+      "step": 433
+    },
+    {
+      "epoch": 2.14,
+      "grad_norm": 1.2913953065872192,
+      "learning_rate": 8.949347164748762e-05,
+      "loss": 0.7149,
+      "step": 434
+    },
+    {
+      "epoch": 2.14,
+      "grad_norm": 1.0722301006317139,
+      "learning_rate": 8.909095511502452e-05,
+      "loss": 0.539,
+      "step": 435
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 1.3729982376098633,
+      "learning_rate": 8.868861738047158e-05,
+      "loss": 0.7224,
+      "step": 436
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 1.3238959312438965,
+      "learning_rate": 8.828646503809504e-05,
+      "loss": 0.7495,
+      "step": 437
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 1.4315913915634155,
+      "learning_rate": 8.788450467912255e-05,
+      "loss": 0.5041,
+      "step": 438
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 1.145209789276123,
+      "learning_rate": 8.748274289163514e-05,
+      "loss": 0.6526,
+      "step": 439
+    },
+    {
+      "epoch": 2.17,
+      "grad_norm": 1.3024333715438843,
+      "learning_rate": 8.70811862604594e-05,
+      "loss": 0.7016,
+      "step": 440
+    },
+    {
+      "epoch": 2.17,
+      "grad_norm": 1.524943232536316,
+      "learning_rate": 8.667984136705928e-05,
+      "loss": 0.7276,
+      "step": 441
+    },
+    {
+      "epoch": 2.18,
+      "grad_norm": 1.4063531160354614,
+      "learning_rate": 8.627871478942851e-05,
+      "loss": 0.6246,
+      "step": 442
+    },
+    {
+      "epoch": 2.18,
+      "grad_norm": 1.2883118391036987,
+      "learning_rate": 8.587781310198255e-05,
+      "loss": 0.7363,
+      "step": 443
+    },
+    {
+      "epoch": 2.19,
+      "grad_norm": 1.2209841012954712,
+      "learning_rate": 8.5477142875451e-05,
+      "loss": 0.5598,
+      "step": 444
+    },
+    {
+      "epoch": 2.19,
+      "grad_norm": 0.9916577339172363,
+      "learning_rate": 8.507671067676979e-05,
+      "loss": 0.4323,
+      "step": 445
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 1.306430459022522,
+      "learning_rate": 8.467652306897369e-05,
+      "loss": 0.7043,
+      "step": 446
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 1.0825719833374023,
+      "learning_rate": 8.427658661108857e-05,
+      "loss": 0.634,
+      "step": 447
+    },
+    {
+      "epoch": 2.21,
+      "grad_norm": 1.1884212493896484,
+      "learning_rate": 8.387690785802402e-05,
+      "loss": 0.7186,
+      "step": 448
+    },
+    {
+      "epoch": 2.21,
+      "grad_norm": 1.6867362260818481,
+      "learning_rate": 8.347749336046586e-05,
+      "loss": 0.6552,
+      "step": 449
+    },
+    {
+      "epoch": 2.22,
+      "grad_norm": 1.268347978591919,
+      "learning_rate": 8.307834966476884e-05,
+      "loss": 0.7161,
+      "step": 450
+    },
+    {
+      "epoch": 2.22,
+      "eval_loss": 1.2602972984313965,
+      "eval_runtime": 2.9469,
+      "eval_samples_per_second": 33.934,
+      "eval_steps_per_second": 16.967,
+      "step": 450
+    },
+    {
+      "epoch": 2.22,
+      "grad_norm": 0.998717188835144,
+      "learning_rate": 8.267948331284923e-05,
+      "loss": 0.5212,
+      "step": 451
+    },
+    {
+      "epoch": 2.23,
+      "grad_norm": 1.153731346130371,
+      "learning_rate": 8.228090084207774e-05,
+      "loss": 0.6208,
+      "step": 452
+    },
+    {
+      "epoch": 2.23,
+      "grad_norm": 1.3108233213424683,
+      "learning_rate": 8.188260878517224e-05,
+      "loss": 0.6973,
+      "step": 453
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 1.1354055404663086,
+      "learning_rate": 8.14846136700908e-05,
+      "loss": 0.4217,
+      "step": 454
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 1.1650023460388184,
+      "learning_rate": 8.108692201992465e-05,
+      "loss": 0.4248,
+      "step": 455
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 1.2203434705734253,
+      "learning_rate": 8.068954035279121e-05,
+      "loss": 0.6691,
+      "step": 456
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 1.2530115842819214,
+      "learning_rate": 8.02924751817274e-05,
+      "loss": 0.6395,
+      "step": 457
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 1.2986165285110474,
+      "learning_rate": 7.989573301458273e-05,
+      "loss": 0.8401,
+      "step": 458
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 1.263421654701233,
+      "learning_rate": 7.949932035391278e-05,
+      "loss": 0.5025,
+      "step": 459
+    },
+    {
+      "epoch": 2.27,
+      "grad_norm": 1.4409805536270142,
+      "learning_rate": 7.91032436968725e-05,
+      "loss": 0.882,
+      "step": 460
+    },
+    {
+      "epoch": 2.27,
+      "grad_norm": 1.6700172424316406,
+      "learning_rate": 7.870750953510984e-05,
+      "loss": 0.8917,
+      "step": 461
+    },
+    {
+      "epoch": 2.28,
+      "grad_norm": 1.1698029041290283,
+      "learning_rate": 7.831212435465924e-05,
+      "loss": 0.664,
+      "step": 462
+    },
+    {
+      "epoch": 2.28,
+      "grad_norm": 1.5076547861099243,
+      "learning_rate": 7.79170946358354e-05,
+      "loss": 0.8633,
+      "step": 463
+    },
+    {
+      "epoch": 2.29,
+      "grad_norm": 1.0880191326141357,
+      "learning_rate": 7.75224268531271e-05,
+      "loss": 0.5256,
+      "step": 464
+    },
+    {
+      "epoch": 2.29,
+      "grad_norm": 1.05411696434021,
+      "learning_rate": 7.71281274750909e-05,
+      "loss": 0.5846,
+      "step": 465
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 1.4615259170532227,
+      "learning_rate": 7.673420296424541e-05,
+      "loss": 0.8497,
+      "step": 466
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 1.4441969394683838,
+      "learning_rate": 7.634065977696511e-05,
+      "loss": 0.7554,
+      "step": 467
+    },
+    {
+      "epoch": 2.31,
+      "grad_norm": 1.2453029155731201,
+      "learning_rate": 7.594750436337467e-05,
+      "loss": 0.6189,
+      "step": 468
+    },
+    {
+      "epoch": 2.31,
+      "grad_norm": 1.3973779678344727,
+      "learning_rate": 7.555474316724313e-05,
+      "loss": 0.7063,
+      "step": 469
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 1.1818283796310425,
+      "learning_rate": 7.516238262587851e-05,
+      "loss": 0.6328,
+      "step": 470
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 1.1389139890670776,
+      "learning_rate": 7.4770429170022e-05,
+      "loss": 0.648,
+      "step": 471
+    },
+    {
+      "epoch": 2.33,
+      "grad_norm": 1.4820585250854492,
+      "learning_rate": 7.437888922374276e-05,
+      "loss": 0.7222,
+      "step": 472
+    },
+    {
+      "epoch": 2.33,
+      "grad_norm": 1.3325060606002808,
+      "learning_rate": 7.398776920433258e-05,
+      "loss": 0.6432,
+      "step": 473
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 1.0379180908203125,
+      "learning_rate": 7.35970755222007e-05,
+      "loss": 0.3837,
+      "step": 474
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 1.351940631866455,
+      "learning_rate": 7.320681458076871e-05,
+      "loss": 0.6917,
+      "step": 475
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 1.2660441398620605,
+      "learning_rate": 7.281699277636572e-05,
+      "loss": 0.6345,
+      "step": 476
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 1.4925462007522583,
+      "learning_rate": 7.242761649812335e-05,
+      "loss": 0.4858,
+      "step": 477
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 1.315204381942749,
+      "learning_rate": 7.20386921278711e-05,
+      "loss": 0.7035,
+      "step": 478
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 1.4045330286026,
+      "learning_rate": 7.165022604003186e-05,
+      "loss": 0.809,
+      "step": 479
+    },
+    {
+      "epoch": 2.37,
+      "grad_norm": 1.2216744422912598,
+      "learning_rate": 7.126222460151719e-05,
+      "loss": 0.584,
+      "step": 480
+    },
+    {
+      "epoch": 2.37,
+      "grad_norm": 1.272883415222168,
+      "learning_rate": 7.08746941716232e-05,
+      "loss": 0.613,
+      "step": 481
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 1.1015321016311646,
+      "learning_rate": 7.048764110192618e-05,
+      "loss": 0.4539,
+      "step": 482
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 1.173862099647522,
+      "learning_rate": 7.010107173617857e-05,
+      "loss": 0.6842,
+      "step": 483
+    },
+    {
+      "epoch": 2.39,
+      "grad_norm": 1.3101396560668945,
+      "learning_rate": 6.971499241020495e-05,
+      "loss": 0.6377,
+      "step": 484
+    },
+    {
+      "epoch": 2.39,
+      "grad_norm": 1.1513952016830444,
+      "learning_rate": 6.932940945179818e-05,
+      "loss": 0.502,
+      "step": 485
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 1.2137222290039062,
+      "learning_rate": 6.894432918061579e-05,
+      "loss": 0.6232,
+      "step": 486
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 1.0849742889404297,
+      "learning_rate": 6.855975790807623e-05,
+      "loss": 0.4799,
+      "step": 487
+    },
+    {
+      "epoch": 2.41,
+      "grad_norm": 1.1737949848175049,
+      "learning_rate": 6.817570193725564e-05,
+      "loss": 0.5119,
+      "step": 488
+    },
+    {
+      "epoch": 2.41,
+      "grad_norm": 1.3420112133026123,
+      "learning_rate": 6.77921675627843e-05,
+      "loss": 0.7176,
+      "step": 489
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 1.3262616395950317,
+      "learning_rate": 6.740916107074372e-05,
+      "loss": 0.7479,
+      "step": 490
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 1.3177785873413086,
+      "learning_rate": 6.702668873856338e-05,
+      "loss": 0.6498,
+      "step": 491
+    },
+    {
+      "epoch": 2.43,
+      "grad_norm": 1.3273133039474487,
+      "learning_rate": 6.664475683491796e-05,
+      "loss": 0.6036,
+      "step": 492
+    },
+    {
+      "epoch": 2.43,
+      "grad_norm": 1.1320433616638184,
+      "learning_rate": 6.626337161962461e-05,
+      "loss": 0.5075,
+      "step": 493
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 1.2999693155288696,
+      "learning_rate": 6.588253934354039e-05,
+      "loss": 0.5805,
+      "step": 494
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 1.2638920545578003,
+      "learning_rate": 6.550226624845961e-05,
+      "loss": 0.6831,
+      "step": 495
+    },
+    {
+      "epoch": 2.45,
+      "grad_norm": 1.246358871459961,
+      "learning_rate": 6.512255856701177e-05,
+      "loss": 0.5891,
+      "step": 496
+    },
+    {
+      "epoch": 2.45,
+      "grad_norm": 1.216341257095337,
+      "learning_rate": 6.474342252255927e-05,
+      "loss": 0.6533,
+      "step": 497
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 1.4384123086929321,
+      "learning_rate": 6.43648643290955e-05,
+      "loss": 0.7545,
+      "step": 498
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 1.2650271654129028,
+      "learning_rate": 6.398689019114289e-05,
+      "loss": 0.7225,
+      "step": 499
+    },
+    {
+      "epoch": 2.47,
+      "grad_norm": 1.2374640703201294,
+      "learning_rate": 6.360950630365126e-05,
+      "loss": 0.5897,
+      "step": 500
+    },
+    {
+      "epoch": 2.47,
+      "eval_loss": 1.2661257982254028,
+      "eval_runtime": 2.939,
+      "eval_samples_per_second": 34.025,
+      "eval_steps_per_second": 17.013,
+      "step": 500
+    },
+    {
+      "epoch": 2.47,
+      "grad_norm": 1.6078161001205444,
+      "learning_rate": 6.323271885189635e-05,
+      "loss": 0.5883,
+      "step": 501
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 1.2864457368850708,
+      "learning_rate": 6.285653401137837e-05,
+      "loss": 0.6071,
+      "step": 502
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 1.3440560102462769,
+      "learning_rate": 6.248095794772079e-05,
+      "loss": 0.7475,
+      "step": 503
+    },
+    {
+      "epoch": 2.49,
+      "grad_norm": 1.1603760719299316,
+      "learning_rate": 6.210599681656933e-05,
+      "loss": 0.6603,
+      "step": 504
+    },
+    {
+      "epoch": 2.49,
+      "grad_norm": 1.2979274988174438,
+      "learning_rate": 6.173165676349103e-05,
+      "loss": 0.6754,
+      "step": 505
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 1.1852139234542847,
+      "learning_rate": 6.135794392387353e-05,
+      "loss": 0.6516,
+      "step": 506
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 1.2540236711502075,
+      "learning_rate": 6.0984864422824496e-05,
+      "loss": 0.5239,
+      "step": 507
+    },
+    {
+      "epoch": 2.51,
+      "grad_norm": 1.5349066257476807,
+      "learning_rate": 6.061242437507131e-05,
+      "loss": 0.5365,
+      "step": 508
+    },
+    {
+      "epoch": 2.51,
+      "grad_norm": 1.258698582649231,
+      "learning_rate": 6.024062988486072e-05,
+      "loss": 0.7099,
+      "step": 509
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 1.2852509021759033,
+      "learning_rate": 5.986948704585895e-05,
+      "loss": 0.4977,
+      "step": 510
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 1.3446050882339478,
+      "learning_rate": 5.949900194105167e-05,
+      "loss": 0.6753,
+      "step": 511
+    },
+    {
+      "epoch": 2.53,
+      "grad_norm": 1.2775357961654663,
+      "learning_rate": 5.9129180642644414e-05,
+      "loss": 0.5833,
+      "step": 512
+    },
+    {
+      "epoch": 2.53,
+      "grad_norm": 1.2563271522521973,
+      "learning_rate": 5.8760029211962954e-05,
+      "loss": 0.5167,
+      "step": 513
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 1.1414707899093628,
+      "learning_rate": 5.839155369935407e-05,
+      "loss": 0.6838,
+      "step": 514
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 1.5055315494537354,
+      "learning_rate": 5.802376014408632e-05,
+      "loss": 0.672,
+      "step": 515
+    },
+    {
+      "epoch": 2.55,
+      "grad_norm": 1.3252393007278442,
+      "learning_rate": 5.765665457425102e-05,
+      "loss": 0.7256,
+      "step": 516
+    },
+    {
+      "epoch": 2.55,
+      "grad_norm": 1.1352269649505615,
+      "learning_rate": 5.729024300666349e-05,
+      "loss": 0.5319,
+      "step": 517
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 1.3474462032318115,
+      "learning_rate": 5.6924531446764504e-05,
+      "loss": 0.5271,
+      "step": 518
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 1.1647965908050537,
+      "learning_rate": 5.6559525888521815e-05,
+      "loss": 0.5496,
+      "step": 519
+    },
+    {
+      "epoch": 2.57,
+      "grad_norm": 1.3941562175750732,
+      "learning_rate": 5.6195232314331766e-05,
+      "loss": 0.5551,
+      "step": 520
+    },
+    {
+      "epoch": 2.57,
+      "grad_norm": 1.4575625658035278,
+      "learning_rate": 5.5831656694921465e-05,
+      "loss": 0.6719,
+      "step": 521
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 1.2469514608383179,
+      "learning_rate": 5.5468804989250786e-05,
+      "loss": 0.6593,
+      "step": 522
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 1.3567513227462769,
+      "learning_rate": 5.510668314441474e-05,
+      "loss": 0.5666,
+      "step": 523
+    },
+    {
+      "epoch": 2.59,
+      "grad_norm": 1.294553279876709,
+      "learning_rate": 5.474529709554612e-05,
+      "loss": 0.6345,
+      "step": 524
+    },
+    {
+      "epoch": 2.59,
+      "grad_norm": 1.0715196132659912,
+      "learning_rate": 5.438465276571796e-05,
+      "loss": 0.401,
+      "step": 525
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 1.3244280815124512,
+      "learning_rate": 5.402475606584669e-05,
+      "loss": 0.6757,
+      "step": 526
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 1.2300620079040527,
+      "learning_rate": 5.366561289459512e-05,
+      "loss": 0.7366,
+      "step": 527
+    },
+    {
+      "epoch": 2.61,
+      "grad_norm": 1.0702522993087769,
+      "learning_rate": 5.3307229138275936e-05,
+      "loss": 0.4266,
+      "step": 528
+    },
+    {
+      "epoch": 2.61,
+      "grad_norm": 1.2548829317092896,
+      "learning_rate": 5.2949610670755e-05,
+      "loss": 0.8007,
+      "step": 529
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 1.3317065238952637,
+      "learning_rate": 5.259276335335521e-05,
+      "loss": 0.6294,
+      "step": 530
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 1.2665762901306152,
+      "learning_rate": 5.223669303476041e-05,
+      "loss": 0.514,
+      "step": 531
+    },
+    {
+      "epoch": 2.63,
+      "grad_norm": 1.2807515859603882,
+      "learning_rate": 5.1881405550919493e-05,
+      "loss": 0.6037,
+      "step": 532
+    },
+    {
+      "epoch": 2.63,
+      "grad_norm": 1.541114330291748,
+      "learning_rate": 5.152690672495091e-05,
+      "loss": 0.6603,
+      "step": 533
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 1.299155831336975,
+      "learning_rate": 5.117320236704697e-05,
+      "loss": 0.5944,
+      "step": 534
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 1.4731584787368774,
+      "learning_rate": 5.08202982743788e-05,
+      "loss": 0.7807,
+      "step": 535
+    },
+    {
+      "epoch": 2.65,
+      "grad_norm": 1.2540079355239868,
+      "learning_rate": 5.0468200231001286e-05,
+      "loss": 0.6043,
+      "step": 536
+    },
+    {
+      "epoch": 2.65,
+      "grad_norm": 1.2161357402801514,
+      "learning_rate": 5.01169140077582e-05,
+      "loss": 0.5168,
+      "step": 537
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 1.5545177459716797,
+      "learning_rate": 4.976644536218783e-05,
+      "loss": 0.5285,
+      "step": 538
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 1.5177483558654785,
+      "learning_rate": 4.9416800038428324e-05,
+      "loss": 0.6826,
+      "step": 539
+    },
+    {
+      "epoch": 2.67,
+      "grad_norm": 1.369188666343689,
+      "learning_rate": 4.9067983767123736e-05,
+      "loss": 0.7984,
+      "step": 540
+    },
+    {
+      "epoch": 2.67,
+      "grad_norm": 1.6426352262496948,
+      "learning_rate": 4.8720002265330015e-05,
+      "loss": 0.6126,
+      "step": 541
+    },
+    {
+      "epoch": 2.68,
+      "grad_norm": 1.1411386728286743,
+      "learning_rate": 4.837286123642141e-05,
+      "loss": 0.6635,
+      "step": 542
+    },
+    {
+      "epoch": 2.68,
+      "grad_norm": 1.2911747694015503,
+      "learning_rate": 4.8026566369996926e-05,
+      "loss": 0.4522,
+      "step": 543
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 1.55097496509552,
+      "learning_rate": 4.768112334178699e-05,
+      "loss": 0.8282,
+      "step": 544
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 1.736786961555481,
+      "learning_rate": 4.733653781356055e-05,
+      "loss": 0.6144,
+      "step": 545
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 1.2241405248641968,
+      "learning_rate": 4.699281543303222e-05,
+      "loss": 0.5656,
+      "step": 546
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 1.30910325050354,
+      "learning_rate": 4.6649961833769715e-05,
+      "loss": 0.5732,
+      "step": 547
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 1.780985713005066,
+      "learning_rate": 4.630798263510162e-05,
+      "loss": 0.5689,
+      "step": 548
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 1.4643489122390747,
+      "learning_rate": 4.596688344202509e-05,
+      "loss": 0.633,
+      "step": 549
+    },
+    {
+      "epoch": 2.72,
+      "grad_norm": 1.273721694946289,
+      "learning_rate": 4.562666984511416e-05,
+      "loss": 0.5271,
+      "step": 550
+    },
+    {
+      "epoch": 2.72,
+      "eval_loss": 1.281378149986267,
+      "eval_runtime": 3.0044,
+      "eval_samples_per_second": 33.285,
+      "eval_steps_per_second": 16.642,
+      "step": 550
+    },
+    {
+      "epoch": 2.72,
+      "grad_norm": 1.3252663612365723,
+      "learning_rate": 4.528734742042803e-05,
+      "loss": 0.4885,
+      "step": 551
+    },
+    {
+      "epoch": 2.73,
+      "grad_norm": 1.159148097038269,
+      "learning_rate": 4.494892172941965e-05,
+      "loss": 0.4881,
+      "step": 552
+    },
+    {
+      "epoch": 2.73,
+      "grad_norm": 1.4068233966827393,
+      "learning_rate": 4.461139831884474e-05,
+      "loss": 0.6787,
+      "step": 553
+    },
+    {
+      "epoch": 2.74,
+      "grad_norm": 1.279906153678894,
+      "learning_rate": 4.427478272067066e-05,
+      "loss": 0.5426,
+      "step": 554
+    },
+    {
+      "epoch": 2.74,
+      "grad_norm": 1.1998430490493774,
+      "learning_rate": 4.393908045198585e-05,
+      "loss": 0.5433,
+      "step": 555
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 1.3037670850753784,
+      "learning_rate": 4.360429701490934e-05,
+      "loss": 0.5773,
+      "step": 556
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 1.260678768157959,
+      "learning_rate": 4.327043789650078e-05,
+      "loss": 0.4421,
+      "step": 557
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 0.9158841967582703,
+      "learning_rate": 4.2937508568670194e-05,
+      "loss": 0.4472,
+      "step": 558
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 1.4653347730636597,
+      "learning_rate": 4.2605514488088515e-05,
+      "loss": 0.7012,
+      "step": 559
+    },
+    {
+      "epoch": 2.77,
+      "grad_norm": 1.3992079496383667,
+      "learning_rate": 4.227446109609809e-05,
+      "loss": 0.5479,
+      "step": 560
+    },
+    {
+      "epoch": 2.77,
+      "grad_norm": 1.353127360343933,
+      "learning_rate": 4.1944353818623424e-05,
+      "loss": 0.7026,
+      "step": 561
+    },
+    {
+      "epoch": 2.78,
+      "grad_norm": 1.393446683883667,
+      "learning_rate": 4.161519806608247e-05,
+      "loss": 0.5551,
+      "step": 562
+    },
+    {
+      "epoch": 2.78,
+      "grad_norm": 1.1399997472763062,
+      "learning_rate": 4.12869992332977e-05,
+      "loss": 0.3954,
+      "step": 563
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 1.5926597118377686,
+      "learning_rate": 4.0959762699407766e-05,
+      "loss": 0.6895,
+      "step": 564
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 1.2445831298828125,
+      "learning_rate": 4.0633493827779425e-05,
+      "loss": 0.5555,
+      "step": 565
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 1.3105766773223877,
+      "learning_rate": 4.030819796591949e-05,
+      "loss": 0.7623,
+      "step": 566
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 1.4394389390945435,
+      "learning_rate": 3.9983880445387366e-05,
+      "loss": 0.6299,
+      "step": 567
+    },
+    {
+      "epoch": 2.81,
+      "grad_norm": 1.4189199209213257,
+      "learning_rate": 3.966054658170754e-05,
+      "loss": 0.601,
+      "step": 568
+    },
+    {
+      "epoch": 2.81,
+      "grad_norm": 1.0969223976135254,
+      "learning_rate": 3.9338201674282406e-05,
+      "loss": 0.3905,
+      "step": 569
+    },
+    {
+      "epoch": 2.82,
+      "grad_norm": 1.2229801416397095,
+      "learning_rate": 3.9016851006305545e-05,
+      "loss": 0.5059,
+      "step": 570
+    },
+    {
+      "epoch": 2.82,
+      "grad_norm": 1.474869966506958,
+      "learning_rate": 3.869649984467504e-05,
+      "loss": 0.6408,
+      "step": 571
+    },
+    {
+      "epoch": 2.83,
+      "grad_norm": 1.5417041778564453,
+      "learning_rate": 3.8377153439907266e-05,
+      "loss": 0.6754,
+      "step": 572
+    },
+    {
+      "epoch": 2.83,
+      "grad_norm": 1.4375914335250854,
+      "learning_rate": 3.8058817026050677e-05,
+      "loss": 0.681,
+      "step": 573
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 1.1246694326400757,
+      "learning_rate": 3.774149582060012e-05,
+      "loss": 0.5772,
+      "step": 574
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 1.3641024827957153,
+      "learning_rate": 3.742519502441132e-05,
+      "loss": 0.7361,
+      "step": 575
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 1.533368706703186,
+      "learning_rate": 3.710991982161555e-05,
+      "loss": 0.5878,
+      "step": 576
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 1.2794849872589111,
+      "learning_rate": 3.679567537953485e-05,
+      "loss": 0.5081,
+      "step": 577
+    },
+    {
+      "epoch": 2.86,
+      "grad_norm": 1.4266434907913208,
+      "learning_rate": 3.648246684859716e-05,
+      "loss": 0.7266,
+      "step": 578
+    },
+    {
+      "epoch": 2.86,
+      "grad_norm": 1.4641722440719604,
+      "learning_rate": 3.617029936225193e-05,
+      "loss": 0.6243,
+      "step": 579
+    },
+    {
+      "epoch": 2.87,
+      "grad_norm": 1.2470731735229492,
+      "learning_rate": 3.585917803688603e-05,
+      "loss": 0.5591,
+      "step": 580
+    },
+    {
+      "epoch": 2.87,
+      "grad_norm": 1.4247914552688599,
+      "learning_rate": 3.55491079717399e-05,
+      "loss": 0.6843,
+      "step": 581
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 1.4032796621322632,
+      "learning_rate": 3.5240094248824e-05,
+      "loss": 0.6464,
+      "step": 582
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 2.002753257751465,
+      "learning_rate": 3.493214193283536e-05,
+      "loss": 0.5833,
+      "step": 583
+    },
+    {
+      "epoch": 2.89,
+      "grad_norm": 1.3017632961273193,
+      "learning_rate": 3.4625256071074773e-05,
+      "loss": 0.7035,
+      "step": 584
+    },
+    {
+      "epoch": 2.89,
+      "grad_norm": 1.2375082969665527,
+      "learning_rate": 3.4319441693363906e-05,
+      "loss": 0.6327,
+      "step": 585
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 1.4240336418151855,
+      "learning_rate": 3.4014703811963025e-05,
+      "loss": 0.7169,
+      "step": 586
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 1.3496601581573486,
+      "learning_rate": 3.3711047421488675e-05,
+      "loss": 0.5654,
+      "step": 587
+    },
+    {
+      "epoch": 2.91,
+      "grad_norm": 1.3548609018325806,
+      "learning_rate": 3.340847749883191e-05,
+      "loss": 0.5809,
+      "step": 588
+    },
+    {
+      "epoch": 2.91,
+      "grad_norm": 1.3450309038162231,
+      "learning_rate": 3.3106999003076746e-05,
+      "loss": 0.7006,
+      "step": 589
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 1.364551305770874,
+      "learning_rate": 3.280661687541876e-05,
+      "loss": 0.5856,
+      "step": 590
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 1.390633225440979,
+      "learning_rate": 3.2507336039084314e-05,
+      "loss": 0.58,
+      "step": 591
+    },
+    {
+      "epoch": 2.93,
+      "grad_norm": 1.3074802160263062,
+      "learning_rate": 3.2209161399249674e-05,
+      "loss": 0.5153,
+      "step": 592
+    },
+    {
+      "epoch": 2.93,
+      "grad_norm": 1.4046270847320557,
+      "learning_rate": 3.191209784296068e-05,
+      "loss": 0.7275,
+      "step": 593
+    },
+    {
+      "epoch": 2.94,
+      "grad_norm": 1.3784432411193848,
+      "learning_rate": 3.161615023905265e-05,
+      "loss": 0.7039,
+      "step": 594
+    },
+    {
+      "epoch": 2.94,
+      "grad_norm": 1.3270719051361084,
+      "learning_rate": 3.132132343807056e-05,
+      "loss": 0.7857,
+      "step": 595
+    },
+    {
+      "epoch": 2.95,
+      "grad_norm": 1.4376825094223022,
+      "learning_rate": 3.102762227218957e-05,
+      "loss": 0.7024,
+      "step": 596
+    },
+    {
+      "epoch": 2.95,
+      "grad_norm": 1.263853907585144,
+      "learning_rate": 3.073505155513591e-05,
+      "loss": 0.6328,
+      "step": 597
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 796,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 199,
+  "total_flos": 2.301389698719744e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}