diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-1000/trainer_state.json"
@@ -0,0 +1,7021 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1000.0,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.33083319664001465,
+      "learning_rate": 2e-05,
+      "loss": 1.2008,
+      "step": 1
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.3303728997707367,
+      "learning_rate": 4e-05,
+      "loss": 1.2064,
+      "step": 2
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.3344730734825134,
+      "learning_rate": 6e-05,
+      "loss": 1.1978,
+      "step": 3
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.3437633514404297,
+      "learning_rate": 8e-05,
+      "loss": 1.1916,
+      "step": 4
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.35260695219039917,
+      "learning_rate": 0.0001,
+      "loss": 1.1857,
+      "step": 5
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 0.35693618655204773,
+      "learning_rate": 0.00012,
+      "loss": 1.1802,
+      "step": 6
+    },
+    {
+      "epoch": 7.0,
+      "grad_norm": 0.370437353849411,
+      "learning_rate": 0.00014000000000000001,
+      "loss": 1.1727,
+      "step": 7
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 0.4025161564350128,
+      "learning_rate": 0.00016,
+      "loss": 1.1517,
+      "step": 8
+    },
+    {
+      "epoch": 9.0,
+      "grad_norm": 0.4241160750389099,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 1.1291,
+      "step": 9
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.4702780544757843,
+      "learning_rate": 0.0002,
+      "loss": 1.1016,
+      "step": 10
+    },
+    {
+      "epoch": 11.0,
+      "grad_norm": 0.4980923533439636,
+      "learning_rate": 0.00022,
+      "loss": 1.0659,
+      "step": 11
+    },
+    {
+      "epoch": 12.0,
+      "grad_norm": 0.49099475145339966,
+      "learning_rate": 0.00024,
+      "loss": 1.0292,
+      "step": 12
+    },
+    {
+      "epoch": 13.0,
+      "grad_norm": 0.3541075587272644,
+      "learning_rate": 0.00026000000000000003,
+      "loss": 0.9999,
+      "step": 13
+    },
+    {
+      "epoch": 14.0,
+      "grad_norm": 0.5386999845504761,
+      "learning_rate": 0.00028000000000000003,
+      "loss": 0.9961,
+      "step": 14
+    },
+    {
+      "epoch": 15.0,
+      "grad_norm": 0.7703613042831421,
+      "learning_rate": 0.0003,
+      "loss": 0.9971,
+      "step": 15
+    },
+    {
+      "epoch": 16.0,
+      "grad_norm": 0.76753169298172,
+      "learning_rate": 0.00032,
+      "loss": 0.9816,
+      "step": 16
+    },
+    {
+      "epoch": 17.0,
+      "grad_norm": 0.6905381679534912,
+      "learning_rate": 0.00034,
+      "loss": 0.9583,
+      "step": 17
+    },
+    {
+      "epoch": 18.0,
+      "grad_norm": 0.47108712792396545,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 0.9283,
+      "step": 18
+    },
+    {
+      "epoch": 19.0,
+      "grad_norm": 0.23705267906188965,
+      "learning_rate": 0.00038,
+      "loss": 0.9124,
+      "step": 19
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 0.40022554993629456,
+      "learning_rate": 0.0004,
+      "loss": 0.9014,
+      "step": 20
+    },
+    {
+      "epoch": 21.0,
+      "grad_norm": 0.46119189262390137,
+      "learning_rate": 0.00042,
+      "loss": 0.9008,
+      "step": 21
+    },
+    {
+      "epoch": 22.0,
+      "grad_norm": 0.3977269232273102,
+      "learning_rate": 0.00044,
+      "loss": 0.8831,
+      "step": 22
+    },
+    {
+      "epoch": 23.0,
+      "grad_norm": 0.317619651556015,
+      "learning_rate": 0.00046,
+      "loss": 0.8675,
+      "step": 23
+    },
+    {
+      "epoch": 24.0,
+      "grad_norm": 0.21384184062480927,
+      "learning_rate": 0.00048,
+      "loss": 0.8475,
+      "step": 24
+    },
+    {
+      "epoch": 25.0,
+      "grad_norm": 0.2396264374256134,
+      "learning_rate": 0.0005,
+      "loss": 0.8411,
+      "step": 25
+    },
+    {
+      "epoch": 26.0,
+      "grad_norm": 0.32538554072380066,
+      "learning_rate": 0.0005200000000000001,
+      "loss": 0.8369,
+      "step": 26
+    },
+    {
+      "epoch": 27.0,
+      "grad_norm": 0.30055898427963257,
+      "learning_rate": 0.00054,
+      "loss": 0.8208,
+      "step": 27
+    },
+    {
+      "epoch": 28.0,
+      "grad_norm": 0.24997705221176147,
+      "learning_rate": 0.0005600000000000001,
+      "loss": 0.8115,
+      "step": 28
+    },
+    {
+      "epoch": 29.0,
+      "grad_norm": 0.17621903121471405,
+      "learning_rate": 0.00058,
+      "loss": 0.7968,
+      "step": 29
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 0.2194773107767105,
+      "learning_rate": 0.0006,
+      "loss": 0.7909,
+      "step": 30
+    },
+    {
+      "epoch": 31.0,
+      "grad_norm": 0.2789114713668823,
+      "learning_rate": 0.00062,
+      "loss": 0.7807,
+      "step": 31
+    },
+    {
+      "epoch": 32.0,
+      "grad_norm": 0.2547268569469452,
+      "learning_rate": 0.00064,
+      "loss": 0.7746,
+      "step": 32
+    },
+    {
+      "epoch": 33.0,
+      "grad_norm": 0.1717722862958908,
+      "learning_rate": 0.00066,
+      "loss": 0.7638,
+      "step": 33
+    },
+    {
+      "epoch": 34.0,
+      "grad_norm": 0.1685272604227066,
+      "learning_rate": 0.00068,
+      "loss": 0.7537,
+      "step": 34
+    },
+    {
+      "epoch": 35.0,
+      "grad_norm": 0.1899716705083847,
+      "learning_rate": 0.0007,
+      "loss": 0.7477,
+      "step": 35
+    },
+    {
+      "epoch": 36.0,
+      "grad_norm": 0.17042605578899384,
+      "learning_rate": 0.0007199999999999999,
+      "loss": 0.7332,
+      "step": 36
+    },
+    {
+      "epoch": 37.0,
+      "grad_norm": 0.12942233681678772,
+      "learning_rate": 0.00074,
+      "loss": 0.7308,
+      "step": 37
+    },
+    {
+      "epoch": 38.0,
+      "grad_norm": 0.17319905757904053,
+      "learning_rate": 0.00076,
+      "loss": 0.7168,
+      "step": 38
+    },
+    {
+      "epoch": 39.0,
+      "grad_norm": 0.2035578042268753,
+      "learning_rate": 0.0007800000000000001,
+      "loss": 0.7101,
+      "step": 39
+    },
+    {
+      "epoch": 40.0,
+      "grad_norm": 0.12170140445232391,
+      "learning_rate": 0.0008,
+      "loss": 0.6984,
+      "step": 40
+    },
+    {
+      "epoch": 41.0,
+      "grad_norm": 0.15441377460956573,
+      "learning_rate": 0.00082,
+      "loss": 0.689,
+      "step": 41
+    },
+    {
+      "epoch": 42.0,
+      "grad_norm": 0.2036806046962738,
+      "learning_rate": 0.00084,
+      "loss": 0.6795,
+      "step": 42
+    },
+    {
+      "epoch": 43.0,
+      "grad_norm": 0.1200585812330246,
+      "learning_rate": 0.00086,
+      "loss": 0.674,
+      "step": 43
+    },
+    {
+      "epoch": 44.0,
+      "grad_norm": 0.17891642451286316,
+      "learning_rate": 0.00088,
+      "loss": 0.6666,
+      "step": 44
+    },
+    {
+      "epoch": 45.0,
+      "grad_norm": 0.15115144848823547,
+      "learning_rate": 0.0009000000000000001,
+      "loss": 0.6556,
+      "step": 45
+    },
+    {
+      "epoch": 46.0,
+      "grad_norm": 0.1360883265733719,
+      "learning_rate": 0.00092,
+      "loss": 0.6455,
+      "step": 46
+    },
+    {
+      "epoch": 47.0,
+      "grad_norm": 0.15223605930805206,
+      "learning_rate": 0.00094,
+      "loss": 0.6417,
+      "step": 47
+    },
+    {
+      "epoch": 48.0,
+      "grad_norm": 0.1136261597275734,
+      "learning_rate": 0.00096,
+      "loss": 0.6329,
+      "step": 48
+    },
+    {
+      "epoch": 49.0,
+      "grad_norm": 0.14590047299861908,
+      "learning_rate": 0.00098,
+      "loss": 0.6202,
+      "step": 49
+    },
+    {
+      "epoch": 50.0,
+      "grad_norm": 0.1152251586318016,
+      "learning_rate": 0.001,
+      "loss": 0.61,
+      "step": 50
+    },
+    {
+      "epoch": 51.0,
+      "grad_norm": 0.13655897974967957,
+      "learning_rate": 0.00102,
+      "loss": 0.5998,
+      "step": 51
+    },
+    {
+      "epoch": 52.0,
+      "grad_norm": 0.12263485044240952,
+      "learning_rate": 0.0010400000000000001,
+      "loss": 0.5932,
+      "step": 52
+    },
+    {
+      "epoch": 53.0,
+      "grad_norm": 0.13639847934246063,
+      "learning_rate": 0.0010600000000000002,
+      "loss": 0.5835,
+      "step": 53
+    },
+    {
+      "epoch": 54.0,
+      "grad_norm": 0.11983615159988403,
+      "learning_rate": 0.00108,
+      "loss": 0.5726,
+      "step": 54
+    },
+    {
+      "epoch": 55.0,
+      "grad_norm": 0.10973749309778214,
+      "learning_rate": 0.0011,
+      "loss": 0.5625,
+      "step": 55
+    },
+    {
+      "epoch": 56.0,
+      "grad_norm": 0.12474089860916138,
+      "learning_rate": 0.0011200000000000001,
+      "loss": 0.5522,
+      "step": 56
+    },
+    {
+      "epoch": 57.0,
+      "grad_norm": 0.13365118205547333,
+      "learning_rate": 0.00114,
+      "loss": 0.5432,
+      "step": 57
+    },
+    {
+      "epoch": 58.0,
+      "grad_norm": 0.11922751367092133,
+      "learning_rate": 0.00116,
+      "loss": 0.5289,
+      "step": 58
+    },
+    {
+      "epoch": 59.0,
+      "grad_norm": 0.12625685334205627,
+      "learning_rate": 0.00118,
+      "loss": 0.5248,
+      "step": 59
+    },
+    {
+      "epoch": 60.0,
+      "grad_norm": 0.16271014511585236,
+      "learning_rate": 0.0012,
+      "loss": 0.5091,
+      "step": 60
+    },
+    {
+      "epoch": 61.0,
+      "grad_norm": 0.17841878533363342,
+      "learning_rate": 0.00122,
+      "loss": 0.4991,
+      "step": 61
+    },
+    {
+      "epoch": 62.0,
+      "grad_norm": 0.22032824158668518,
+      "learning_rate": 0.00124,
+      "loss": 0.4945,
+      "step": 62
+    },
+    {
+      "epoch": 63.0,
+      "grad_norm": 0.22950832545757294,
+      "learning_rate": 0.00126,
+      "loss": 0.4829,
+      "step": 63
+    },
+    {
+      "epoch": 64.0,
+      "grad_norm": 0.26760056614875793,
+      "learning_rate": 0.00128,
+      "loss": 0.4725,
+      "step": 64
+    },
+    {
+      "epoch": 65.0,
+      "grad_norm": 0.2522161602973938,
+      "learning_rate": 0.0013000000000000002,
+      "loss": 0.4595,
+      "step": 65
+    },
+    {
+      "epoch": 66.0,
+      "grad_norm": 0.24825534224510193,
+      "learning_rate": 0.00132,
+      "loss": 0.4498,
+      "step": 66
+    },
+    {
+      "epoch": 67.0,
+      "grad_norm": 0.192153200507164,
+      "learning_rate": 0.00134,
+      "loss": 0.4397,
+      "step": 67
+    },
+    {
+      "epoch": 68.0,
+      "grad_norm": 0.15480364859104156,
+      "learning_rate": 0.00136,
+      "loss": 0.4315,
+      "step": 68
+    },
+    {
+      "epoch": 69.0,
+      "grad_norm": 0.1302233189344406,
+      "learning_rate": 0.00138,
+      "loss": 0.4166,
+      "step": 69
+    },
+    {
+      "epoch": 70.0,
+      "grad_norm": 0.12858335673809052,
+      "learning_rate": 0.0014,
+      "loss": 0.4061,
+      "step": 70
+    },
+    {
+      "epoch": 71.0,
+      "grad_norm": 0.14819584786891937,
+      "learning_rate": 0.00142,
+      "loss": 0.3977,
+      "step": 71
+    },
+    {
+      "epoch": 72.0,
+      "grad_norm": 0.20153498649597168,
+      "learning_rate": 0.0014399999999999999,
+      "loss": 0.391,
+      "step": 72
+    },
+    {
+      "epoch": 73.0,
+      "grad_norm": 0.22449156641960144,
+      "learning_rate": 0.00146,
+      "loss": 0.3788,
+      "step": 73
+    },
+    {
+      "epoch": 74.0,
+      "grad_norm": 0.20828841626644135,
+      "learning_rate": 0.00148,
+      "loss": 0.3685,
+      "step": 74
+    },
+    {
+      "epoch": 75.0,
+      "grad_norm": 0.20556601881980896,
+      "learning_rate": 0.0015,
+      "loss": 0.3578,
+      "step": 75
+    },
+    {
+      "epoch": 76.0,
+      "grad_norm": 0.14805461466312408,
+      "learning_rate": 0.00152,
+      "loss": 0.3496,
+      "step": 76
+    },
+    {
+      "epoch": 77.0,
+      "grad_norm": 0.1653750240802765,
+      "learning_rate": 0.0015400000000000001,
+      "loss": 0.3421,
+      "step": 77
+    },
+    {
+      "epoch": 78.0,
+      "grad_norm": 0.15252411365509033,
+      "learning_rate": 0.0015600000000000002,
+      "loss": 0.3307,
+      "step": 78
+    },
+    {
+      "epoch": 79.0,
+      "grad_norm": 0.17012618482112885,
+      "learning_rate": 0.00158,
+      "loss": 0.3219,
+      "step": 79
+    },
+    {
+      "epoch": 80.0,
+      "grad_norm": 0.22152607142925262,
+      "learning_rate": 0.0016,
+      "loss": 0.3131,
+      "step": 80
+    },
+    {
+      "epoch": 81.0,
+      "grad_norm": 0.33657050132751465,
+      "learning_rate": 0.0016200000000000001,
+      "loss": 0.3072,
+      "step": 81
+    },
+    {
+      "epoch": 82.0,
+      "grad_norm": 0.6131005883216858,
+      "learning_rate": 0.00164,
+      "loss": 0.3065,
+      "step": 82
+    },
+    {
+      "epoch": 83.0,
+      "grad_norm": 0.8047762513160706,
+      "learning_rate": 0.00166,
+      "loss": 0.3261,
+      "step": 83
+    },
+    {
+      "epoch": 84.0,
+      "grad_norm": 0.2988828420639038,
+      "learning_rate": 0.00168,
+      "loss": 0.2857,
+      "step": 84
+    },
+    {
+      "epoch": 85.0,
+      "grad_norm": 0.4759994149208069,
+      "learning_rate": 0.0017,
+      "loss": 0.2885,
+      "step": 85
+    },
+    {
+      "epoch": 86.0,
+      "grad_norm": 0.29689377546310425,
+      "learning_rate": 0.00172,
+      "loss": 0.2734,
+      "step": 86
+    },
+    {
+      "epoch": 87.0,
+      "grad_norm": 0.3681173026561737,
+      "learning_rate": 0.00174,
+      "loss": 0.2711,
+      "step": 87
+    },
+    {
+      "epoch": 88.0,
+      "grad_norm": 0.23025013506412506,
+      "learning_rate": 0.00176,
+      "loss": 0.2591,
+      "step": 88
+    },
+    {
+      "epoch": 89.0,
+      "grad_norm": 0.3065023124217987,
+      "learning_rate": 0.0017800000000000001,
+      "loss": 0.2552,
+      "step": 89
+    },
+    {
+      "epoch": 90.0,
+      "grad_norm": 0.20948432385921478,
+      "learning_rate": 0.0018000000000000002,
+      "loss": 0.2454,
+      "step": 90
+    },
+    {
+      "epoch": 91.0,
+      "grad_norm": 0.303070068359375,
+      "learning_rate": 0.00182,
+      "loss": 0.2402,
+      "step": 91
+    },
+    {
+      "epoch": 92.0,
+      "grad_norm": 0.17348457872867584,
+      "learning_rate": 0.00184,
+      "loss": 0.2282,
+      "step": 92
+    },
+    {
+      "epoch": 93.0,
+      "grad_norm": 0.2221260964870453,
+      "learning_rate": 0.00186,
+      "loss": 0.227,
+      "step": 93
+    },
+    {
+      "epoch": 94.0,
+      "grad_norm": 0.2136717438697815,
+      "learning_rate": 0.00188,
+      "loss": 0.217,
+      "step": 94
+    },
+    {
+      "epoch": 95.0,
+      "grad_norm": 0.18598757684230804,
+      "learning_rate": 0.0019,
+      "loss": 0.2093,
+      "step": 95
+    },
+    {
+      "epoch": 96.0,
+      "grad_norm": 0.2381504476070404,
+      "learning_rate": 0.00192,
+      "loss": 0.2075,
+      "step": 96
+    },
+    {
+      "epoch": 97.0,
+      "grad_norm": 0.16604942083358765,
+      "learning_rate": 0.0019399999999999999,
+      "loss": 0.1968,
+      "step": 97
+    },
+    {
+      "epoch": 98.0,
+      "grad_norm": 0.1756654977798462,
+      "learning_rate": 0.00196,
+      "loss": 0.1914,
+      "step": 98
+    },
+    {
+      "epoch": 99.0,
+      "grad_norm": 0.18636329472064972,
+      "learning_rate": 0.00198,
+      "loss": 0.1871,
+      "step": 99
+    },
+    {
+      "epoch": 100.0,
+      "grad_norm": 0.15465092658996582,
+      "learning_rate": 0.002,
+      "loss": 0.1811,
+      "step": 100
+    },
+    {
+      "epoch": 101.0,
+      "grad_norm": 0.17197373509407043,
+      "learning_rate": 0.001997777777777778,
+      "loss": 0.1776,
+      "step": 101
+    },
+    {
+      "epoch": 102.0,
+      "grad_norm": 0.1587797999382019,
+      "learning_rate": 0.0019955555555555555,
+      "loss": 0.1711,
+      "step": 102
+    },
+    {
+      "epoch": 103.0,
+      "grad_norm": 0.16476012766361237,
+      "learning_rate": 0.0019933333333333335,
+      "loss": 0.1672,
+      "step": 103
+    },
+    {
+      "epoch": 104.0,
+      "grad_norm": 0.15602949261665344,
+      "learning_rate": 0.001991111111111111,
+      "loss": 0.1614,
+      "step": 104
+    },
+    {
+      "epoch": 105.0,
+      "grad_norm": 0.14582307636737823,
+      "learning_rate": 0.001988888888888889,
+      "loss": 0.1538,
+      "step": 105
+    },
+    {
+      "epoch": 106.0,
+      "grad_norm": 0.15502554178237915,
+      "learning_rate": 0.0019866666666666665,
+      "loss": 0.1507,
+      "step": 106
+    },
+    {
+      "epoch": 107.0,
+      "grad_norm": 0.16572409868240356,
+      "learning_rate": 0.0019844444444444445,
+      "loss": 0.1479,
+      "step": 107
+    },
+    {
+      "epoch": 108.0,
+      "grad_norm": 0.12365152686834335,
+      "learning_rate": 0.0019822222222222225,
+      "loss": 0.1416,
+      "step": 108
+    },
+    {
+      "epoch": 109.0,
+      "grad_norm": 0.18411065638065338,
+      "learning_rate": 0.00198,
+      "loss": 0.1401,
+      "step": 109
+    },
+    {
+      "epoch": 110.0,
+      "grad_norm": 0.15501633286476135,
+      "learning_rate": 0.001977777777777778,
+      "loss": 0.1352,
+      "step": 110
+    },
+    {
+      "epoch": 111.0,
+      "grad_norm": 0.151006817817688,
+      "learning_rate": 0.0019755555555555555,
+      "loss": 0.1297,
+      "step": 111
+    },
+    {
+      "epoch": 112.0,
+      "grad_norm": 0.1714586317539215,
+      "learning_rate": 0.0019733333333333334,
+      "loss": 0.1287,
+      "step": 112
+    },
+    {
+      "epoch": 113.0,
+      "grad_norm": 0.14063534140586853,
+      "learning_rate": 0.001971111111111111,
+      "loss": 0.124,
+      "step": 113
+    },
+    {
+      "epoch": 114.0,
+      "grad_norm": 0.15380504727363586,
+      "learning_rate": 0.001968888888888889,
+      "loss": 0.1213,
+      "step": 114
+    },
+    {
+      "epoch": 115.0,
+      "grad_norm": 0.13425365090370178,
+      "learning_rate": 0.0019666666666666665,
+      "loss": 0.1164,
+      "step": 115
+    },
+    {
+      "epoch": 116.0,
+      "grad_norm": 0.16011053323745728,
+      "learning_rate": 0.0019644444444444444,
+      "loss": 0.1153,
+      "step": 116
+    },
+    {
+      "epoch": 117.0,
+      "grad_norm": 0.12487512826919556,
+      "learning_rate": 0.0019622222222222224,
+      "loss": 0.1102,
+      "step": 117
+    },
+    {
+      "epoch": 118.0,
+      "grad_norm": 0.16518144309520721,
+      "learning_rate": 0.00196,
+      "loss": 0.1094,
+      "step": 118
+    },
+    {
+      "epoch": 119.0,
+      "grad_norm": 0.12795649468898773,
+      "learning_rate": 0.001957777777777778,
+      "loss": 0.1053,
+      "step": 119
+    },
+    {
+      "epoch": 120.0,
+      "grad_norm": 0.17986378073692322,
+      "learning_rate": 0.0019555555555555554,
+      "loss": 0.1028,
+      "step": 120
+    },
+    {
+      "epoch": 121.0,
+      "grad_norm": 0.12250766158103943,
+      "learning_rate": 0.0019533333333333334,
+      "loss": 0.0997,
+      "step": 121
+    },
+    {
+      "epoch": 122.0,
+      "grad_norm": 0.16212934255599976,
+      "learning_rate": 0.0019511111111111111,
+      "loss": 0.1001,
+      "step": 122
+    },
+    {
+      "epoch": 123.0,
+      "grad_norm": 0.11872928589582443,
+      "learning_rate": 0.001948888888888889,
+      "loss": 0.0955,
+      "step": 123
+    },
+    {
+      "epoch": 124.0,
+      "grad_norm": 0.14130988717079163,
+      "learning_rate": 0.0019466666666666669,
+      "loss": 0.0928,
+      "step": 124
+    },
+    {
+      "epoch": 125.0,
+      "grad_norm": 0.12492092698812485,
+      "learning_rate": 0.0019444444444444444,
+      "loss": 0.0901,
+      "step": 125
+    },
+    {
+      "epoch": 126.0,
+      "grad_norm": 0.1664874404668808,
+      "learning_rate": 0.0019422222222222224,
+      "loss": 0.0897,
+      "step": 126
+    },
+    {
+      "epoch": 127.0,
+      "grad_norm": 0.13373196125030518,
+      "learning_rate": 0.0019399999999999999,
+      "loss": 0.0873,
+      "step": 127
+    },
+    {
+      "epoch": 128.0,
+      "grad_norm": 0.15403704345226288,
+      "learning_rate": 0.0019377777777777778,
+      "loss": 0.0842,
+      "step": 128
+    },
+    {
+      "epoch": 129.0,
+      "grad_norm": 0.12807263433933258,
+      "learning_rate": 0.0019355555555555556,
+      "loss": 0.0815,
+      "step": 129
+    },
+    {
+      "epoch": 130.0,
+      "grad_norm": 0.11713800579309464,
+      "learning_rate": 0.0019333333333333333,
+      "loss": 0.0798,
+      "step": 130
+    },
+    {
+      "epoch": 131.0,
+      "grad_norm": 0.11321233212947845,
+      "learning_rate": 0.001931111111111111,
+      "loss": 0.0783,
+      "step": 131
+    },
+    {
+      "epoch": 132.0,
+      "grad_norm": 0.11122141778469086,
+      "learning_rate": 0.0019288888888888888,
+      "loss": 0.0762,
+      "step": 132
+    },
+    {
+      "epoch": 133.0,
+      "grad_norm": 0.11468366533517838,
+      "learning_rate": 0.0019266666666666668,
+      "loss": 0.0744,
+      "step": 133
+    },
+    {
+      "epoch": 134.0,
+      "grad_norm": 0.10164441913366318,
+      "learning_rate": 0.0019244444444444443,
+      "loss": 0.0715,
+      "step": 134
+    },
+    {
+      "epoch": 135.0,
+      "grad_norm": 0.1132010817527771,
+      "learning_rate": 0.0019222222222222223,
+      "loss": 0.0704,
+      "step": 135
+    },
+    {
+      "epoch": 136.0,
+      "grad_norm": 0.10453721135854721,
+      "learning_rate": 0.00192,
+      "loss": 0.0683,
+      "step": 136
+    },
+    {
+      "epoch": 137.0,
+      "grad_norm": 0.11656288057565689,
+      "learning_rate": 0.0019177777777777778,
+      "loss": 0.0671,
+      "step": 137
+    },
+    {
+      "epoch": 138.0,
+      "grad_norm": 0.10022811591625214,
+      "learning_rate": 0.0019155555555555555,
+      "loss": 0.0633,
+      "step": 138
+    },
+    {
+      "epoch": 139.0,
+      "grad_norm": 0.10532992333173752,
+      "learning_rate": 0.0019133333333333333,
+      "loss": 0.0652,
+      "step": 139
+    },
+    {
+      "epoch": 140.0,
+      "grad_norm": 0.1101190447807312,
+      "learning_rate": 0.0019111111111111113,
+      "loss": 0.0641,
+      "step": 140
+    },
+    {
+      "epoch": 141.0,
+      "grad_norm": 0.12256909906864166,
+      "learning_rate": 0.001908888888888889,
+      "loss": 0.0618,
+      "step": 141
+    },
+    {
+      "epoch": 142.0,
+      "grad_norm": 0.10809922963380814,
+      "learning_rate": 0.0019066666666666668,
+      "loss": 0.0604,
+      "step": 142
+    },
+    {
+      "epoch": 143.0,
+      "grad_norm": 0.10069157928228378,
+      "learning_rate": 0.0019044444444444445,
+      "loss": 0.0577,
+      "step": 143
+    },
+    {
+      "epoch": 144.0,
+      "grad_norm": 0.10927508026361465,
+      "learning_rate": 0.0019022222222222222,
+      "loss": 0.0568,
+      "step": 144
+    },
+    {
+      "epoch": 145.0,
+      "grad_norm": 0.11600210517644882,
+      "learning_rate": 0.0019,
+      "loss": 0.0569,
+      "step": 145
+    },
+    {
+      "epoch": 146.0,
+      "grad_norm": 0.12598258256912231,
+      "learning_rate": 0.0018977777777777777,
+      "loss": 0.0556,
+      "step": 146
+    },
+    {
+      "epoch": 147.0,
+      "grad_norm": 0.11476030200719833,
+      "learning_rate": 0.0018955555555555557,
+      "loss": 0.0537,
+      "step": 147
+    },
+    {
+      "epoch": 148.0,
+      "grad_norm": 0.13637323677539825,
+      "learning_rate": 0.0018933333333333335,
+      "loss": 0.0555,
+      "step": 148
+    },
+    {
+      "epoch": 149.0,
+      "grad_norm": 0.11303841322660446,
+      "learning_rate": 0.0018911111111111112,
+      "loss": 0.0514,
+      "step": 149
+    },
+    {
+      "epoch": 150.0,
+      "grad_norm": 0.12342188507318497,
+      "learning_rate": 0.001888888888888889,
+      "loss": 0.052,
+      "step": 150
+    },
+    {
+      "epoch": 151.0,
+      "grad_norm": 0.10494454950094223,
+      "learning_rate": 0.0018866666666666667,
+      "loss": 0.0496,
+      "step": 151
+    },
+    {
+      "epoch": 152.0,
+      "grad_norm": 0.18033015727996826,
+      "learning_rate": 0.0018844444444444444,
+      "loss": 0.0506,
+      "step": 152
+    },
+    {
+      "epoch": 153.0,
+      "grad_norm": 0.1556997448205948,
+      "learning_rate": 0.0018822222222222222,
+      "loss": 0.0493,
+      "step": 153
+    },
+    {
+      "epoch": 154.0,
+      "grad_norm": 0.14510443806648254,
+      "learning_rate": 0.00188,
+      "loss": 0.0488,
+      "step": 154
+    },
+    {
+      "epoch": 155.0,
+      "grad_norm": 0.18192656338214874,
+      "learning_rate": 0.001877777777777778,
+      "loss": 0.0489,
+      "step": 155
+    },
+    {
+      "epoch": 156.0,
+      "grad_norm": 0.11847592890262604,
+      "learning_rate": 0.0018755555555555557,
+      "loss": 0.0463,
+      "step": 156
+    },
+    {
+      "epoch": 157.0,
+      "grad_norm": 0.15888793766498566,
+      "learning_rate": 0.0018733333333333334,
+      "loss": 0.0462,
+      "step": 157
+    },
+    {
+      "epoch": 158.0,
+      "grad_norm": 0.12652742862701416,
+      "learning_rate": 0.0018711111111111112,
+      "loss": 0.0451,
+      "step": 158
+    },
+    {
+      "epoch": 159.0,
+      "grad_norm": 0.16675762832164764,
+      "learning_rate": 0.001868888888888889,
+      "loss": 0.0446,
+      "step": 159
+    },
+    {
+      "epoch": 160.0,
+      "grad_norm": 0.10220520198345184,
+      "learning_rate": 0.0018666666666666666,
+      "loss": 0.0422,
+      "step": 160
+    },
+    {
+      "epoch": 161.0,
+      "grad_norm": 0.11163638532161713,
+      "learning_rate": 0.0018644444444444444,
+      "loss": 0.0424,
+      "step": 161
+    },
+    {
+      "epoch": 162.0,
+      "grad_norm": 0.11750265210866928,
+      "learning_rate": 0.0018622222222222224,
+      "loss": 0.0418,
+      "step": 162
+    },
+    {
+      "epoch": 163.0,
+      "grad_norm": 0.09098522365093231,
+      "learning_rate": 0.00186,
+      "loss": 0.0395,
+      "step": 163
+    },
+    {
+      "epoch": 164.0,
+      "grad_norm": 0.10419722646474838,
+      "learning_rate": 0.0018577777777777779,
+      "loss": 0.0395,
+      "step": 164
+    },
+    {
+      "epoch": 165.0,
+      "grad_norm": 0.09210154414176941,
+      "learning_rate": 0.0018555555555555556,
+      "loss": 0.0382,
+      "step": 165
+    },
+    {
+      "epoch": 166.0,
+      "grad_norm": 0.10829305648803711,
+      "learning_rate": 0.0018533333333333334,
+      "loss": 0.0379,
+      "step": 166
+    },
+    {
+      "epoch": 167.0,
+      "grad_norm": 0.08274272084236145,
+      "learning_rate": 0.001851111111111111,
+      "loss": 0.0375,
+      "step": 167
+    },
+    {
+      "epoch": 168.0,
+      "grad_norm": 0.08142054080963135,
+      "learning_rate": 0.0018488888888888888,
+      "loss": 0.0363,
+      "step": 168
+    },
+    {
+      "epoch": 169.0,
+      "grad_norm": 0.1034446507692337,
+      "learning_rate": 0.0018466666666666668,
+      "loss": 0.037,
+      "step": 169
+    },
+    {
+      "epoch": 170.0,
+      "grad_norm": 0.08171215653419495,
+      "learning_rate": 0.0018444444444444446,
+      "loss": 0.0365,
+      "step": 170
+    },
+    {
+      "epoch": 171.0,
+      "grad_norm": 0.1313827484846115,
+      "learning_rate": 0.0018422222222222223,
+      "loss": 0.0372,
+      "step": 171
+    },
+    {
+      "epoch": 172.0,
+      "grad_norm": 0.08655200153589249,
+      "learning_rate": 0.00184,
+      "loss": 0.0352,
+      "step": 172
+    },
+    {
+      "epoch": 173.0,
+      "grad_norm": 0.1611122488975525,
+      "learning_rate": 0.0018377777777777778,
+      "loss": 0.0355,
+      "step": 173
+    },
+    {
+      "epoch": 174.0,
+      "grad_norm": 0.07916305959224701,
+      "learning_rate": 0.0018355555555555556,
+      "loss": 0.0338,
+      "step": 174
+    },
+    {
+      "epoch": 175.0,
+      "grad_norm": 0.14963340759277344,
+      "learning_rate": 0.0018333333333333333,
+      "loss": 0.0355,
+      "step": 175
+    },
+    {
+      "epoch": 176.0,
+      "grad_norm": 0.08377885818481445,
+      "learning_rate": 0.0018311111111111113,
+      "loss": 0.0327,
+      "step": 176
+    },
+    {
+      "epoch": 177.0,
+      "grad_norm": 0.1263081133365631,
+      "learning_rate": 0.0018288888888888888,
+      "loss": 0.0339,
+      "step": 177
+    },
+    {
+      "epoch": 178.0,
+      "grad_norm": 0.09036055207252502,
+      "learning_rate": 0.0018266666666666668,
+      "loss": 0.0319,
+      "step": 178
+    },
+    {
+      "epoch": 179.0,
+      "grad_norm": 0.08474431186914444,
+      "learning_rate": 0.0018244444444444445,
+      "loss": 0.0312,
+      "step": 179
+    },
+    {
+      "epoch": 180.0,
+      "grad_norm": 0.1496538668870926,
+      "learning_rate": 0.0018222222222222223,
+      "loss": 0.0325,
+      "step": 180
+    },
+    {
+      "epoch": 181.0,
+      "grad_norm": 0.0777578204870224,
+      "learning_rate": 0.00182,
+      "loss": 0.0301,
+      "step": 181
+    },
+    {
+      "epoch": 182.0,
+      "grad_norm": 0.14861616492271423,
+      "learning_rate": 0.0018177777777777778,
+      "loss": 0.0316,
+      "step": 182
+    },
+    {
+      "epoch": 183.0,
+      "grad_norm": 0.07743405550718307,
+      "learning_rate": 0.0018155555555555557,
+      "loss": 0.0302,
+      "step": 183
+    },
+    {
+      "epoch": 184.0,
+      "grad_norm": 0.12615865468978882,
+      "learning_rate": 0.0018133333333333332,
+      "loss": 0.0307,
+      "step": 184
+    },
+    {
+      "epoch": 185.0,
+      "grad_norm": 0.09123097360134125,
+      "learning_rate": 0.0018111111111111112,
+      "loss": 0.0283,
+      "step": 185
+    },
+    {
+      "epoch": 186.0,
+      "grad_norm": 0.08183681964874268,
+      "learning_rate": 0.001808888888888889,
+      "loss": 0.0288,
+      "step": 186
+    },
+    {
+      "epoch": 187.0,
+      "grad_norm": 0.12389379739761353,
+      "learning_rate": 0.0018066666666666667,
+      "loss": 0.029,
+      "step": 187
+    },
+    {
+      "epoch": 188.0,
+      "grad_norm": 0.07837918400764465,
+      "learning_rate": 0.0018044444444444445,
+      "loss": 0.0269,
+      "step": 188
+    },
+    {
+      "epoch": 189.0,
+      "grad_norm": 0.13364242017269135,
+      "learning_rate": 0.0018022222222222222,
+      "loss": 0.0289,
+      "step": 189
+    },
+    {
+      "epoch": 190.0,
+      "grad_norm": 0.06742086261510849,
+      "learning_rate": 0.0018000000000000002,
+      "loss": 0.0265,
+      "step": 190
+    },
+    {
+      "epoch": 191.0,
+      "grad_norm": 0.12599071860313416,
+      "learning_rate": 0.0017977777777777777,
+      "loss": 0.0281,
+      "step": 191
+    },
+    {
+      "epoch": 192.0,
+      "grad_norm": 0.0692119225859642,
+      "learning_rate": 0.0017955555555555557,
+      "loss": 0.0259,
+      "step": 192
+    },
+    {
+      "epoch": 193.0,
+      "grad_norm": 0.08335267007350922,
+      "learning_rate": 0.0017933333333333332,
+      "loss": 0.0256,
+      "step": 193
+    },
+    {
+      "epoch": 194.0,
+      "grad_norm": 0.09375467896461487,
+      "learning_rate": 0.0017911111111111112,
+      "loss": 0.0254,
+      "step": 194
+    },
+    {
+      "epoch": 195.0,
+      "grad_norm": 0.0680534839630127,
+      "learning_rate": 0.001788888888888889,
+      "loss": 0.0252,
+      "step": 195
+    },
+    {
+      "epoch": 196.0,
+      "grad_norm": 0.09214572608470917,
+      "learning_rate": 0.0017866666666666667,
+      "loss": 0.0259,
+      "step": 196
+    },
+    {
+      "epoch": 197.0,
+      "grad_norm": 0.07533039897680283,
+      "learning_rate": 0.0017844444444444446,
+      "loss": 0.0249,
+      "step": 197
+    },
+    {
+      "epoch": 198.0,
+      "grad_norm": 0.06569315493106842,
+      "learning_rate": 0.0017822222222222222,
+      "loss": 0.0245,
+      "step": 198
+    },
+    {
+      "epoch": 199.0,
+      "grad_norm": 0.11427601426839828,
+      "learning_rate": 0.0017800000000000001,
+      "loss": 0.0248,
+      "step": 199
+    },
+    {
+      "epoch": 200.0,
+      "grad_norm": 0.06313782185316086,
+      "learning_rate": 0.0017777777777777776,
+      "loss": 0.0237,
+      "step": 200
+    },
+    {
+      "epoch": 201.0,
+      "grad_norm": 0.08156254887580872,
+      "learning_rate": 0.0017755555555555556,
+      "loss": 0.0237,
+      "step": 201
+    },
+    {
+      "epoch": 202.0,
+      "grad_norm": 0.06546188145875931,
+      "learning_rate": 0.0017733333333333334,
+      "loss": 0.0229,
+      "step": 202
+    },
+    {
+      "epoch": 203.0,
+      "grad_norm": 0.06551296263933182,
+      "learning_rate": 0.001771111111111111,
+      "loss": 0.0227,
+      "step": 203
+    },
+    {
+      "epoch": 204.0,
+      "grad_norm": 0.06212465465068817,
+      "learning_rate": 0.001768888888888889,
+      "loss": 0.0224,
+      "step": 204
+    },
+    {
+      "epoch": 205.0,
+      "grad_norm": 0.06709878146648407,
+      "learning_rate": 0.0017666666666666666,
+      "loss": 0.0225,
+      "step": 205
+    },
+    {
+      "epoch": 206.0,
+      "grad_norm": 0.06637589633464813,
+      "learning_rate": 0.0017644444444444446,
+      "loss": 0.0228,
+      "step": 206
+    },
+    {
+      "epoch": 207.0,
+      "grad_norm": 0.07172456383705139,
+      "learning_rate": 0.001762222222222222,
+      "loss": 0.0218,
+      "step": 207
+    },
+    {
+      "epoch": 208.0,
+      "grad_norm": 0.06298528611660004,
+      "learning_rate": 0.00176,
+      "loss": 0.0219,
+      "step": 208
+    },
+    {
+      "epoch": 209.0,
+      "grad_norm": 0.06474582105875015,
+      "learning_rate": 0.001757777777777778,
+      "loss": 0.0205,
+      "step": 209
+    },
+    {
+      "epoch": 210.0,
+      "grad_norm": 0.064134880900383,
+      "learning_rate": 0.0017555555555555556,
+      "loss": 0.0211,
+      "step": 210
+    },
+    {
+      "epoch": 211.0,
+      "grad_norm": 0.06449710577726364,
+      "learning_rate": 0.0017533333333333335,
+      "loss": 0.0207,
+      "step": 211
+    },
+    {
+      "epoch": 212.0,
+      "grad_norm": 0.05953633040189743,
+      "learning_rate": 0.001751111111111111,
+      "loss": 0.0202,
+      "step": 212
+    },
+    {
+      "epoch": 213.0,
+      "grad_norm": 0.06264209747314453,
+      "learning_rate": 0.001748888888888889,
+      "loss": 0.0209,
+      "step": 213
+    },
+    {
+      "epoch": 214.0,
+      "grad_norm": 0.056829385459423065,
+      "learning_rate": 0.0017466666666666665,
+      "loss": 0.0203,
+      "step": 214
+    },
+    {
+      "epoch": 215.0,
+      "grad_norm": 0.06606360524892807,
+      "learning_rate": 0.0017444444444444445,
+      "loss": 0.0207,
+      "step": 215
+    },
+    {
+      "epoch": 216.0,
+      "grad_norm": 0.05813910439610481,
+      "learning_rate": 0.001742222222222222,
+      "loss": 0.02,
+      "step": 216
+    },
+    {
+      "epoch": 217.0,
+      "grad_norm": 0.06723108887672424,
+      "learning_rate": 0.00174,
+      "loss": 0.0192,
+      "step": 217
+    },
+    {
+      "epoch": 218.0,
+      "grad_norm": 0.05759569630026817,
+      "learning_rate": 0.001737777777777778,
+      "loss": 0.0194,
+      "step": 218
+    },
+    {
+      "epoch": 219.0,
+      "grad_norm": 0.07146891206502914,
+      "learning_rate": 0.0017355555555555555,
+      "loss": 0.0198,
+      "step": 219
+    },
+    {
+      "epoch": 220.0,
+      "grad_norm": 0.06193268671631813,
+      "learning_rate": 0.0017333333333333335,
+      "loss": 0.0196,
+      "step": 220
+    },
+    {
+      "epoch": 221.0,
+      "grad_norm": 0.05935683101415634,
+      "learning_rate": 0.001731111111111111,
+      "loss": 0.0185,
+      "step": 221
+    },
+    {
+      "epoch": 222.0,
+      "grad_norm": 0.05822020396590233,
+      "learning_rate": 0.001728888888888889,
+      "loss": 0.0188,
+      "step": 222
+    },
+    {
+      "epoch": 223.0,
+      "grad_norm": 0.05631418898701668,
+      "learning_rate": 0.0017266666666666667,
+      "loss": 0.0182,
+      "step": 223
+    },
+    {
+      "epoch": 224.0,
+      "grad_norm": 0.05941098555922508,
+      "learning_rate": 0.0017244444444444445,
+      "loss": 0.0185,
+      "step": 224
+    },
+    {
+      "epoch": 225.0,
+      "grad_norm": 0.060537729412317276,
+      "learning_rate": 0.0017222222222222224,
+      "loss": 0.0184,
+      "step": 225
+    },
+    {
+      "epoch": 226.0,
+      "grad_norm": 0.05600609630346298,
+      "learning_rate": 0.00172,
+      "loss": 0.0182,
+      "step": 226
+    },
+    {
+      "epoch": 227.0,
+      "grad_norm": 0.06475334614515305,
+      "learning_rate": 0.001717777777777778,
+      "loss": 0.0173,
+      "step": 227
+    },
+    {
+      "epoch": 228.0,
+      "grad_norm": 0.06878049671649933,
+      "learning_rate": 0.0017155555555555555,
+      "loss": 0.0179,
+      "step": 228
+    },
+    {
+      "epoch": 229.0,
+      "grad_norm": 0.07705635577440262,
+      "learning_rate": 0.0017133333333333334,
+      "loss": 0.0176,
+      "step": 229
+    },
+    {
+      "epoch": 230.0,
+      "grad_norm": 0.07096253335475922,
+      "learning_rate": 0.0017111111111111112,
+      "loss": 0.0174,
+      "step": 230
+    },
+    {
+      "epoch": 231.0,
+      "grad_norm": 0.0696563646197319,
+      "learning_rate": 0.001708888888888889,
+      "loss": 0.0178,
+      "step": 231
+    },
+    {
+      "epoch": 232.0,
+      "grad_norm": 0.08657005429267883,
+      "learning_rate": 0.0017066666666666669,
+      "loss": 0.0176,
+      "step": 232
+    },
+    {
+      "epoch": 233.0,
+      "grad_norm": 0.05430256947875023,
+      "learning_rate": 0.0017044444444444444,
+      "loss": 0.0166,
+      "step": 233
+    },
+    {
+      "epoch": 234.0,
+      "grad_norm": 0.09091652184724808,
+      "learning_rate": 0.0017022222222222224,
+      "loss": 0.0171,
+      "step": 234
+    },
+    {
+      "epoch": 235.0,
+      "grad_norm": 0.060334715992212296,
+      "learning_rate": 0.0017,
+      "loss": 0.0172,
+      "step": 235
+    },
+    {
+      "epoch": 236.0,
+      "grad_norm": 0.08081120997667313,
+      "learning_rate": 0.0016977777777777779,
+      "loss": 0.0163,
+      "step": 236
+    },
+    {
+      "epoch": 237.0,
+      "grad_norm": 0.05501524358987808,
+      "learning_rate": 0.0016955555555555556,
+      "loss": 0.0161,
+      "step": 237
+    },
+    {
+      "epoch": 238.0,
+      "grad_norm": 0.05665547773241997,
+      "learning_rate": 0.0016933333333333334,
+      "loss": 0.0161,
+      "step": 238
+    },
+    {
+      "epoch": 239.0,
+      "grad_norm": 0.06293457001447678,
+      "learning_rate": 0.0016911111111111111,
+      "loss": 0.0165,
+      "step": 239
+    },
+    {
+      "epoch": 240.0,
+      "grad_norm": 0.0509144552052021,
+      "learning_rate": 0.0016888888888888889,
+      "loss": 0.0156,
+      "step": 240
+    },
+    {
+      "epoch": 241.0,
+      "grad_norm": 0.05950072035193443,
+      "learning_rate": 0.0016866666666666668,
+      "loss": 0.0161,
+      "step": 241
+    },
+    {
+      "epoch": 242.0,
+      "grad_norm": 0.06005380302667618,
+      "learning_rate": 0.0016844444444444444,
+      "loss": 0.0159,
+      "step": 242
+    },
+    {
+      "epoch": 243.0,
+      "grad_norm": 0.05741410329937935,
+      "learning_rate": 0.0016822222222222223,
+      "loss": 0.0157,
+      "step": 243
+    },
+    {
+      "epoch": 244.0,
+      "grad_norm": 0.06560762971639633,
+      "learning_rate": 0.00168,
+      "loss": 0.0151,
+      "step": 244
+    },
+    {
+      "epoch": 245.0,
+      "grad_norm": 0.05892657861113548,
+      "learning_rate": 0.0016777777777777778,
+      "loss": 0.0158,
+      "step": 245
+    },
+    {
+      "epoch": 246.0,
+      "grad_norm": 0.07045791298151016,
+      "learning_rate": 0.0016755555555555556,
+      "loss": 0.0155,
+      "step": 246
+    },
+    {
+      "epoch": 247.0,
+      "grad_norm": 0.055328112095594406,
+      "learning_rate": 0.0016733333333333333,
+      "loss": 0.0149,
+      "step": 247
+    },
+    {
+      "epoch": 248.0,
+      "grad_norm": 0.06157761439681053,
+      "learning_rate": 0.0016711111111111113,
+      "loss": 0.0149,
+      "step": 248
+    },
+    {
+      "epoch": 249.0,
+      "grad_norm": 0.07325014472007751,
+      "learning_rate": 0.0016688888888888888,
+      "loss": 0.0149,
+      "step": 249
+    },
+    {
+      "epoch": 250.0,
+      "grad_norm": 0.055287521332502365,
+      "learning_rate": 0.0016666666666666668,
+      "loss": 0.0152,
+      "step": 250
+    },
+    {
+      "epoch": 251.0,
+      "grad_norm": 0.09518066048622131,
+      "learning_rate": 0.0016644444444444445,
+      "loss": 0.0162,
+      "step": 251
+    },
+    {
+      "epoch": 252.0,
+      "grad_norm": 0.05666990950703621,
+      "learning_rate": 0.0016622222222222223,
+      "loss": 0.0143,
+      "step": 252
+    },
+    {
+      "epoch": 253.0,
+      "grad_norm": 0.11963475495576859,
+      "learning_rate": 0.00166,
+      "loss": 0.0162,
+      "step": 253
+    },
+    {
+      "epoch": 254.0,
+      "grad_norm": 0.05737292766571045,
+      "learning_rate": 0.0016577777777777778,
+      "loss": 0.0143,
+      "step": 254
+    },
+    {
+      "epoch": 255.0,
+      "grad_norm": 0.06973493844270706,
+      "learning_rate": 0.0016555555555555555,
+      "loss": 0.0147,
+      "step": 255
+    },
+    {
+      "epoch": 256.0,
+      "grad_norm": 0.07595274597406387,
+      "learning_rate": 0.0016533333333333333,
+      "loss": 0.0144,
+      "step": 256
+    },
+    {
+      "epoch": 257.0,
+      "grad_norm": 0.06479303538799286,
+      "learning_rate": 0.0016511111111111112,
+      "loss": 0.0146,
+      "step": 257
+    },
+    {
+      "epoch": 258.0,
+      "grad_norm": 0.08630603551864624,
+      "learning_rate": 0.001648888888888889,
+      "loss": 0.0148,
+      "step": 258
+    },
+    {
+      "epoch": 259.0,
+      "grad_norm": 0.050509463995695114,
+      "learning_rate": 0.0016466666666666667,
+      "loss": 0.014,
+      "step": 259
+    },
+    {
+      "epoch": 260.0,
+      "grad_norm": 0.07676823437213898,
+      "learning_rate": 0.0016444444444444445,
+      "loss": 0.0146,
+      "step": 260
+    },
+    {
+      "epoch": 261.0,
+      "grad_norm": 0.06755410879850388,
+      "learning_rate": 0.0016422222222222222,
+      "loss": 0.0142,
+      "step": 261
+    },
+    {
+      "epoch": 262.0,
+      "grad_norm": 0.055984627455472946,
+      "learning_rate": 0.00164,
+      "loss": 0.0137,
+      "step": 262
+    },
+    {
+      "epoch": 263.0,
+      "grad_norm": 0.06891737133264542,
+      "learning_rate": 0.0016377777777777777,
+      "loss": 0.0141,
+      "step": 263
+    },
+    {
+      "epoch": 264.0,
+      "grad_norm": 0.06404928863048553,
+      "learning_rate": 0.0016355555555555557,
+      "loss": 0.014,
+      "step": 264
+    },
+    {
+      "epoch": 265.0,
+      "grad_norm": 0.06907233595848083,
+      "learning_rate": 0.0016333333333333334,
+      "loss": 0.0139,
+      "step": 265
+    },
+    {
+      "epoch": 266.0,
+      "grad_norm": 0.07640419155359268,
+      "learning_rate": 0.0016311111111111112,
+      "loss": 0.0139,
+      "step": 266
+    },
+    {
+      "epoch": 267.0,
+      "grad_norm": 0.047443002462387085,
+      "learning_rate": 0.001628888888888889,
+      "loss": 0.0127,
+      "step": 267
+    },
+    {
+      "epoch": 268.0,
+      "grad_norm": 0.10862824320793152,
+      "learning_rate": 0.0016266666666666667,
+      "loss": 0.0139,
+      "step": 268
+    },
+    {
+      "epoch": 269.0,
+      "grad_norm": 0.054732754826545715,
+      "learning_rate": 0.0016244444444444444,
+      "loss": 0.0135,
+      "step": 269
+    },
+    {
+      "epoch": 270.0,
+      "grad_norm": 0.06462664902210236,
+      "learning_rate": 0.0016222222222222222,
+      "loss": 0.0124,
+      "step": 270
+    },
+    {
+      "epoch": 271.0,
+      "grad_norm": 0.0714699774980545,
+      "learning_rate": 0.0016200000000000001,
+      "loss": 0.0127,
+      "step": 271
+    },
+    {
+      "epoch": 272.0,
+      "grad_norm": 0.05089233070611954,
+      "learning_rate": 0.0016177777777777779,
+      "loss": 0.0125,
+      "step": 272
+    },
+    {
+      "epoch": 273.0,
+      "grad_norm": 0.1049375906586647,
+      "learning_rate": 0.0016155555555555556,
+      "loss": 0.0142,
+      "step": 273
+    },
+    {
+      "epoch": 274.0,
+      "grad_norm": 0.052508674561977386,
+      "learning_rate": 0.0016133333333333334,
+      "loss": 0.0125,
+      "step": 274
+    },
+    {
+      "epoch": 275.0,
+      "grad_norm": 0.05343548581004143,
+      "learning_rate": 0.0016111111111111111,
+      "loss": 0.0127,
+      "step": 275
+    },
+    {
+      "epoch": 276.0,
+      "grad_norm": 0.0628175362944603,
+      "learning_rate": 0.0016088888888888889,
+      "loss": 0.0127,
+      "step": 276
+    },
+    {
+      "epoch": 277.0,
+      "grad_norm": 0.05540168285369873,
+      "learning_rate": 0.0016066666666666666,
+      "loss": 0.013,
+      "step": 277
+    },
+    {
+      "epoch": 278.0,
+      "grad_norm": 0.06312675029039383,
+      "learning_rate": 0.0016044444444444444,
+      "loss": 0.0127,
+      "step": 278
+    },
+    {
+      "epoch": 279.0,
+      "grad_norm": 0.04804935306310654,
+      "learning_rate": 0.0016022222222222223,
+      "loss": 0.0123,
+      "step": 279
+    },
+    {
+      "epoch": 280.0,
+      "grad_norm": 0.07451438903808594,
+      "learning_rate": 0.0016,
+      "loss": 0.0137,
+      "step": 280
+    },
+    {
+      "epoch": 281.0,
+      "grad_norm": 0.06311290711164474,
+      "learning_rate": 0.0015977777777777778,
+      "loss": 0.0129,
+      "step": 281
+    },
+    {
+      "epoch": 282.0,
+      "grad_norm": 0.04818476364016533,
+      "learning_rate": 0.0015955555555555556,
+      "loss": 0.0114,
+      "step": 282
+    },
+    {
+      "epoch": 283.0,
+      "grad_norm": 0.05734807625412941,
+      "learning_rate": 0.0015933333333333333,
+      "loss": 0.012,
+      "step": 283
+    },
+    {
+      "epoch": 284.0,
+      "grad_norm": 0.05613243579864502,
+      "learning_rate": 0.001591111111111111,
+      "loss": 0.0122,
+      "step": 284
+    },
+    {
+      "epoch": 285.0,
+      "grad_norm": 0.05182022973895073,
+      "learning_rate": 0.0015888888888888888,
+      "loss": 0.0114,
+      "step": 285
+    },
+    {
+      "epoch": 286.0,
+      "grad_norm": 0.058295805007219315,
+      "learning_rate": 0.0015866666666666668,
+      "loss": 0.0117,
+      "step": 286
+    },
+    {
+      "epoch": 287.0,
+      "grad_norm": 0.04197324812412262,
+      "learning_rate": 0.0015844444444444445,
+      "loss": 0.0117,
+      "step": 287
+    },
+    {
+      "epoch": 288.0,
+      "grad_norm": 0.05144956335425377,
+      "learning_rate": 0.0015822222222222223,
+      "loss": 0.0113,
+      "step": 288
+    },
+    {
+      "epoch": 289.0,
+      "grad_norm": 0.0465347096323967,
+      "learning_rate": 0.00158,
+      "loss": 0.0119,
+      "step": 289
+    },
+    {
+      "epoch": 290.0,
+      "grad_norm": 0.04576906934380531,
+      "learning_rate": 0.0015777777777777778,
+      "loss": 0.0112,
+      "step": 290
+    },
+    {
+      "epoch": 291.0,
+      "grad_norm": 0.040685515850782394,
+      "learning_rate": 0.0015755555555555557,
+      "loss": 0.0114,
+      "step": 291
+    },
+    {
+      "epoch": 292.0,
+      "grad_norm": 0.04453601688146591,
+      "learning_rate": 0.0015733333333333333,
+      "loss": 0.0113,
+      "step": 292
+    },
+    {
+      "epoch": 293.0,
+      "grad_norm": 0.053638309240341187,
+      "learning_rate": 0.0015711111111111112,
+      "loss": 0.0114,
+      "step": 293
+    },
+    {
+      "epoch": 294.0,
+      "grad_norm": 0.04391471669077873,
+      "learning_rate": 0.001568888888888889,
+      "loss": 0.0112,
+      "step": 294
+    },
+    {
+      "epoch": 295.0,
+      "grad_norm": 0.045375898480415344,
+      "learning_rate": 0.0015666666666666667,
+      "loss": 0.0113,
+      "step": 295
+    },
+    {
+      "epoch": 296.0,
+      "grad_norm": 0.04697088524699211,
+      "learning_rate": 0.0015644444444444445,
+      "loss": 0.0104,
+      "step": 296
+    },
+    {
+      "epoch": 297.0,
+      "grad_norm": 0.04696211591362953,
+      "learning_rate": 0.0015622222222222222,
+      "loss": 0.011,
+      "step": 297
+    },
+    {
+      "epoch": 298.0,
+      "grad_norm": 0.05330660939216614,
+      "learning_rate": 0.0015600000000000002,
+      "loss": 0.0108,
+      "step": 298
+    },
+    {
+      "epoch": 299.0,
+      "grad_norm": 0.047231633216142654,
+      "learning_rate": 0.0015577777777777777,
+      "loss": 0.0109,
+      "step": 299
+    },
+    {
+      "epoch": 300.0,
+      "grad_norm": 0.06341907382011414,
+      "learning_rate": 0.0015555555555555557,
+      "loss": 0.0107,
+      "step": 300
+    },
+    {
+      "epoch": 301.0,
+      "grad_norm": 0.044753991067409515,
+      "learning_rate": 0.0015533333333333332,
+      "loss": 0.0104,
+      "step": 301
+    },
+    {
+      "epoch": 302.0,
+      "grad_norm": 0.056486308574676514,
+      "learning_rate": 0.0015511111111111112,
+      "loss": 0.0111,
+      "step": 302
+    },
+    {
+      "epoch": 303.0,
+      "grad_norm": 0.04415280744433403,
+      "learning_rate": 0.001548888888888889,
+      "loss": 0.0103,
+      "step": 303
+    },
+    {
+      "epoch": 304.0,
+      "grad_norm": 0.049498315900564194,
+      "learning_rate": 0.0015466666666666667,
+      "loss": 0.0111,
+      "step": 304
+    },
+    {
+      "epoch": 305.0,
+      "grad_norm": 0.04238919913768768,
+      "learning_rate": 0.0015444444444444446,
+      "loss": 0.0106,
+      "step": 305
+    },
+    {
+      "epoch": 306.0,
+      "grad_norm": 0.04967685788869858,
+      "learning_rate": 0.0015422222222222222,
+      "loss": 0.0105,
+      "step": 306
+    },
+    {
+      "epoch": 307.0,
+      "grad_norm": 0.043164417147636414,
+      "learning_rate": 0.0015400000000000001,
+      "loss": 0.0103,
+      "step": 307
+    },
+    {
+      "epoch": 308.0,
+      "grad_norm": 0.05318186432123184,
+      "learning_rate": 0.0015377777777777777,
+      "loss": 0.0101,
+      "step": 308
+    },
+    {
+      "epoch": 309.0,
+      "grad_norm": 0.05574382469058037,
+      "learning_rate": 0.0015355555555555556,
+      "loss": 0.0102,
+      "step": 309
+    },
+    {
+      "epoch": 310.0,
+      "grad_norm": 0.036135945469141006,
+      "learning_rate": 0.0015333333333333334,
+      "loss": 0.0097,
+      "step": 310
+    },
+    {
+      "epoch": 311.0,
+      "grad_norm": 0.07383669912815094,
+      "learning_rate": 0.0015311111111111111,
+      "loss": 0.0104,
+      "step": 311
+    },
+    {
+      "epoch": 312.0,
+      "grad_norm": 0.04554266110062599,
+      "learning_rate": 0.001528888888888889,
+      "loss": 0.0099,
+      "step": 312
+    },
+    {
+      "epoch": 313.0,
+      "grad_norm": 0.07572053372859955,
+      "learning_rate": 0.0015266666666666666,
+      "loss": 0.0109,
+      "step": 313
+    },
+    {
+      "epoch": 314.0,
+      "grad_norm": 0.05518485605716705,
+      "learning_rate": 0.0015244444444444446,
+      "loss": 0.0103,
+      "step": 314
+    },
+    {
+      "epoch": 315.0,
+      "grad_norm": 0.07998452335596085,
+      "learning_rate": 0.0015222222222222221,
+      "loss": 0.0105,
+      "step": 315
+    },
+    {
+      "epoch": 316.0,
+      "grad_norm": 0.058478306978940964,
+      "learning_rate": 0.00152,
+      "loss": 0.0094,
+      "step": 316
+    },
+    {
+      "epoch": 317.0,
+      "grad_norm": 0.04713175818324089,
+      "learning_rate": 0.0015177777777777776,
+      "loss": 0.0099,
+      "step": 317
+    },
+    {
+      "epoch": 318.0,
+      "grad_norm": 0.08784548938274384,
+      "learning_rate": 0.0015155555555555556,
+      "loss": 0.0111,
+      "step": 318
+    },
+    {
+      "epoch": 319.0,
+      "grad_norm": 0.0438971072435379,
+      "learning_rate": 0.0015133333333333335,
+      "loss": 0.0097,
+      "step": 319
+    },
+    {
+      "epoch": 320.0,
+      "grad_norm": 0.0707026869058609,
+      "learning_rate": 0.001511111111111111,
+      "loss": 0.01,
+      "step": 320
+    },
+    {
+      "epoch": 321.0,
+      "grad_norm": 0.04169069975614548,
+      "learning_rate": 0.001508888888888889,
+      "loss": 0.0095,
+      "step": 321
+    },
+    {
+      "epoch": 322.0,
+      "grad_norm": 0.04208659380674362,
+      "learning_rate": 0.0015066666666666666,
+      "loss": 0.0091,
+      "step": 322
+    },
+    {
+      "epoch": 323.0,
+      "grad_norm": 0.04683458432555199,
+      "learning_rate": 0.0015044444444444445,
+      "loss": 0.0098,
+      "step": 323
+    },
+    {
+      "epoch": 324.0,
+      "grad_norm": 0.0388614684343338,
+      "learning_rate": 0.001502222222222222,
+      "loss": 0.0093,
+      "step": 324
+    },
+    {
+      "epoch": 325.0,
+      "grad_norm": 0.04117365926504135,
+      "learning_rate": 0.0015,
+      "loss": 0.0091,
+      "step": 325
+    },
+    {
+      "epoch": 326.0,
+      "grad_norm": 0.042431872338056564,
+      "learning_rate": 0.001497777777777778,
+      "loss": 0.0098,
+      "step": 326
+    },
+    {
+      "epoch": 327.0,
+      "grad_norm": 0.04213636368513107,
+      "learning_rate": 0.0014955555555555555,
+      "loss": 0.0094,
+      "step": 327
+    },
+    {
+      "epoch": 328.0,
+      "grad_norm": 0.03774186596274376,
+      "learning_rate": 0.0014933333333333335,
+      "loss": 0.0088,
+      "step": 328
+    },
+    {
+      "epoch": 329.0,
+      "grad_norm": 0.040397752076387405,
+      "learning_rate": 0.001491111111111111,
+      "loss": 0.0093,
+      "step": 329
+    },
+    {
+      "epoch": 330.0,
+      "grad_norm": 0.04346088692545891,
+      "learning_rate": 0.001488888888888889,
+      "loss": 0.0092,
+      "step": 330
+    },
+    {
+      "epoch": 331.0,
+      "grad_norm": 0.03852611780166626,
+      "learning_rate": 0.0014866666666666665,
+      "loss": 0.0086,
+      "step": 331
+    },
+    {
+      "epoch": 332.0,
+      "grad_norm": 0.052108846604824066,
+      "learning_rate": 0.0014844444444444445,
+      "loss": 0.01,
+      "step": 332
+    },
+    {
+      "epoch": 333.0,
+      "grad_norm": 0.047736842185258865,
+      "learning_rate": 0.0014822222222222224,
+      "loss": 0.009,
+      "step": 333
+    },
+    {
+      "epoch": 334.0,
+      "grad_norm": 0.042882874608039856,
+      "learning_rate": 0.00148,
+      "loss": 0.0087,
+      "step": 334
+    },
+    {
+      "epoch": 335.0,
+      "grad_norm": 0.04501271992921829,
+      "learning_rate": 0.001477777777777778,
+      "loss": 0.0089,
+      "step": 335
+    },
+    {
+      "epoch": 336.0,
+      "grad_norm": 0.04159759730100632,
+      "learning_rate": 0.0014755555555555555,
+      "loss": 0.0091,
+      "step": 336
+    },
+    {
+      "epoch": 337.0,
+      "grad_norm": 0.04928553104400635,
+      "learning_rate": 0.0014733333333333334,
+      "loss": 0.0093,
+      "step": 337
+    },
+    {
+      "epoch": 338.0,
+      "grad_norm": 0.04055558145046234,
+      "learning_rate": 0.001471111111111111,
+      "loss": 0.0088,
+      "step": 338
+    },
+    {
+      "epoch": 339.0,
+      "grad_norm": 0.04683249071240425,
+      "learning_rate": 0.001468888888888889,
+      "loss": 0.0088,
+      "step": 339
+    },
+    {
+      "epoch": 340.0,
+      "grad_norm": 0.05597413331270218,
+      "learning_rate": 0.0014666666666666667,
+      "loss": 0.0096,
+      "step": 340
+    },
+    {
+      "epoch": 341.0,
+      "grad_norm": 0.04371971637010574,
+      "learning_rate": 0.0014644444444444444,
+      "loss": 0.0087,
+      "step": 341
+    },
+    {
+      "epoch": 342.0,
+      "grad_norm": 0.07671177387237549,
+      "learning_rate": 0.0014622222222222224,
+      "loss": 0.0091,
+      "step": 342
+    },
+    {
+      "epoch": 343.0,
+      "grad_norm": 0.03556251898407936,
+      "learning_rate": 0.00146,
+      "loss": 0.0084,
+      "step": 343
+    },
+    {
+      "epoch": 344.0,
+      "grad_norm": 0.05906197428703308,
+      "learning_rate": 0.0014577777777777779,
+      "loss": 0.0088,
+      "step": 344
+    },
+    {
+      "epoch": 345.0,
+      "grad_norm": 0.060906119644641876,
+      "learning_rate": 0.0014555555555555554,
+      "loss": 0.0089,
+      "step": 345
+    },
+    {
+      "epoch": 346.0,
+      "grad_norm": 0.036098964512348175,
+      "learning_rate": 0.0014533333333333334,
+      "loss": 0.0083,
+      "step": 346
+    },
+    {
+      "epoch": 347.0,
+      "grad_norm": 0.07324780523777008,
+      "learning_rate": 0.0014511111111111111,
+      "loss": 0.0087,
+      "step": 347
+    },
+    {
+      "epoch": 348.0,
+      "grad_norm": 0.05585562065243721,
+      "learning_rate": 0.0014488888888888889,
+      "loss": 0.0084,
+      "step": 348
+    },
+    {
+      "epoch": 349.0,
+      "grad_norm": 0.053688161075115204,
+      "learning_rate": 0.0014466666666666668,
+      "loss": 0.0087,
+      "step": 349
+    },
+    {
+      "epoch": 350.0,
+      "grad_norm": 0.06806395202875137,
+      "learning_rate": 0.0014444444444444444,
+      "loss": 0.0091,
+      "step": 350
+    },
+    {
+      "epoch": 351.0,
+      "grad_norm": 0.052333708852529526,
+      "learning_rate": 0.0014422222222222223,
+      "loss": 0.0083,
+      "step": 351
+    },
+    {
+      "epoch": 352.0,
+      "grad_norm": 0.06652957201004028,
+      "learning_rate": 0.0014399999999999999,
+      "loss": 0.0084,
+      "step": 352
+    },
+    {
+      "epoch": 353.0,
+      "grad_norm": 0.09658028930425644,
+      "learning_rate": 0.0014377777777777778,
+      "loss": 0.0095,
+      "step": 353
+    },
+    {
+      "epoch": 354.0,
+      "grad_norm": 0.03924448788166046,
+      "learning_rate": 0.0014355555555555556,
+      "loss": 0.0083,
+      "step": 354
+    },
+    {
+      "epoch": 355.0,
+      "grad_norm": 0.12581630051136017,
+      "learning_rate": 0.0014333333333333333,
+      "loss": 0.0102,
+      "step": 355
+    },
+    {
+      "epoch": 356.0,
+      "grad_norm": 0.04559926316142082,
+      "learning_rate": 0.001431111111111111,
+      "loss": 0.0084,
+      "step": 356
+    },
+    {
+      "epoch": 357.0,
+      "grad_norm": 0.08532512933015823,
+      "learning_rate": 0.0014288888888888888,
+      "loss": 0.0089,
+      "step": 357
+    },
+    {
+      "epoch": 358.0,
+      "grad_norm": 0.05258309841156006,
+      "learning_rate": 0.0014266666666666668,
+      "loss": 0.0088,
+      "step": 358
+    },
+    {
+      "epoch": 359.0,
+      "grad_norm": 0.050833553075790405,
+      "learning_rate": 0.0014244444444444443,
+      "loss": 0.0083,
+      "step": 359
+    },
+    {
+      "epoch": 360.0,
+      "grad_norm": 0.07133360952138901,
+      "learning_rate": 0.0014222222222222223,
+      "loss": 0.0086,
+      "step": 360
+    },
+    {
+      "epoch": 361.0,
+      "grad_norm": 0.04288196563720703,
+      "learning_rate": 0.00142,
+      "loss": 0.0081,
+      "step": 361
+    },
+    {
+      "epoch": 362.0,
+      "grad_norm": 0.050019655376672745,
+      "learning_rate": 0.0014177777777777778,
+      "loss": 0.0081,
+      "step": 362
+    },
+    {
+      "epoch": 363.0,
+      "grad_norm": 0.04713081568479538,
+      "learning_rate": 0.0014155555555555555,
+      "loss": 0.0078,
+      "step": 363
+    },
+    {
+      "epoch": 364.0,
+      "grad_norm": 0.03391753509640694,
+      "learning_rate": 0.0014133333333333333,
+      "loss": 0.0079,
+      "step": 364
+    },
+    {
+      "epoch": 365.0,
+      "grad_norm": 0.05800512805581093,
+      "learning_rate": 0.0014111111111111112,
+      "loss": 0.0083,
+      "step": 365
+    },
+    {
+      "epoch": 366.0,
+      "grad_norm": 0.04860348999500275,
+      "learning_rate": 0.001408888888888889,
+      "loss": 0.008,
+      "step": 366
+    },
+    {
+      "epoch": 367.0,
+      "grad_norm": 0.04747091606259346,
+      "learning_rate": 0.0014066666666666667,
+      "loss": 0.008,
+      "step": 367
+    },
+    {
+      "epoch": 368.0,
+      "grad_norm": 0.05328553542494774,
+      "learning_rate": 0.0014044444444444445,
+      "loss": 0.008,
+      "step": 368
+    },
+    {
+      "epoch": 369.0,
+      "grad_norm": 0.033220142126083374,
+      "learning_rate": 0.0014022222222222222,
+      "loss": 0.0072,
+      "step": 369
+    },
+    {
+      "epoch": 370.0,
+      "grad_norm": 0.03936196118593216,
+      "learning_rate": 0.0014,
+      "loss": 0.0072,
+      "step": 370
+    },
+    {
+      "epoch": 371.0,
+      "grad_norm": 0.035857848823070526,
+      "learning_rate": 0.0013977777777777777,
+      "loss": 0.0077,
+      "step": 371
+    },
+    {
+      "epoch": 372.0,
+      "grad_norm": 0.04019852727651596,
+      "learning_rate": 0.0013955555555555557,
+      "loss": 0.0074,
+      "step": 372
+    },
+    {
+      "epoch": 373.0,
+      "grad_norm": 0.03362793102860451,
+      "learning_rate": 0.0013933333333333334,
+      "loss": 0.0076,
+      "step": 373
+    },
+    {
+      "epoch": 374.0,
+      "grad_norm": 0.04266555234789848,
+      "learning_rate": 0.0013911111111111112,
+      "loss": 0.0077,
+      "step": 374
+    },
+    {
+      "epoch": 375.0,
+      "grad_norm": 0.0359489843249321,
+      "learning_rate": 0.001388888888888889,
+      "loss": 0.0072,
+      "step": 375
+    },
+    {
+      "epoch": 376.0,
+      "grad_norm": 0.05567077919840813,
+      "learning_rate": 0.0013866666666666667,
+      "loss": 0.0082,
+      "step": 376
+    },
+    {
+      "epoch": 377.0,
+      "grad_norm": 0.044729381799697876,
+      "learning_rate": 0.0013844444444444444,
+      "loss": 0.0075,
+      "step": 377
+    },
+    {
+      "epoch": 378.0,
+      "grad_norm": 0.0389576219022274,
+      "learning_rate": 0.0013822222222222222,
+      "loss": 0.0075,
+      "step": 378
+    },
+    {
+      "epoch": 379.0,
+      "grad_norm": 0.03594253212213516,
+      "learning_rate": 0.00138,
+      "loss": 0.0071,
+      "step": 379
+    },
+    {
+      "epoch": 380.0,
+      "grad_norm": 0.03960775211453438,
+      "learning_rate": 0.001377777777777778,
+      "loss": 0.0077,
+      "step": 380
+    },
+    {
+      "epoch": 381.0,
+      "grad_norm": 0.036085862666368484,
+      "learning_rate": 0.0013755555555555556,
+      "loss": 0.0075,
+      "step": 381
+    },
+    {
+      "epoch": 382.0,
+      "grad_norm": 0.04432854801416397,
+      "learning_rate": 0.0013733333333333334,
+      "loss": 0.0077,
+      "step": 382
+    },
+    {
+      "epoch": 383.0,
+      "grad_norm": 0.04181389883160591,
+      "learning_rate": 0.0013711111111111111,
+      "loss": 0.0074,
+      "step": 383
+    },
+    {
+      "epoch": 384.0,
+      "grad_norm": 0.04760071262717247,
+      "learning_rate": 0.0013688888888888889,
+      "loss": 0.0075,
+      "step": 384
+    },
+    {
+      "epoch": 385.0,
+      "grad_norm": 0.03643626347184181,
+      "learning_rate": 0.0013666666666666666,
+      "loss": 0.0074,
+      "step": 385
+    },
+    {
+      "epoch": 386.0,
+      "grad_norm": 0.03681834042072296,
+      "learning_rate": 0.0013644444444444444,
+      "loss": 0.0072,
+      "step": 386
+    },
+    {
+      "epoch": 387.0,
+      "grad_norm": 0.053312405943870544,
+      "learning_rate": 0.0013622222222222223,
+      "loss": 0.0081,
+      "step": 387
+    },
+    {
+      "epoch": 388.0,
+      "grad_norm": 0.0378199964761734,
+      "learning_rate": 0.00136,
+      "loss": 0.0071,
+      "step": 388
+    },
+    {
+      "epoch": 389.0,
+      "grad_norm": 0.035717807710170746,
+      "learning_rate": 0.0013577777777777778,
+      "loss": 0.0072,
+      "step": 389
+    },
+    {
+      "epoch": 390.0,
+      "grad_norm": 0.051514606922864914,
+      "learning_rate": 0.0013555555555555556,
+      "loss": 0.0077,
+      "step": 390
+    },
+    {
+      "epoch": 391.0,
+      "grad_norm": 0.039895568042993546,
+      "learning_rate": 0.0013533333333333333,
+      "loss": 0.0072,
+      "step": 391
+    },
+    {
+      "epoch": 392.0,
+      "grad_norm": 0.03736487403512001,
+      "learning_rate": 0.001351111111111111,
+      "loss": 0.0068,
+      "step": 392
+    },
+    {
+      "epoch": 393.0,
+      "grad_norm": 0.034880317747592926,
+      "learning_rate": 0.0013488888888888888,
+      "loss": 0.0071,
+      "step": 393
+    },
+    {
+      "epoch": 394.0,
+      "grad_norm": 0.053845588117837906,
+      "learning_rate": 0.0013466666666666668,
+      "loss": 0.0077,
+      "step": 394
+    },
+    {
+      "epoch": 395.0,
+      "grad_norm": 0.034650277346372604,
+      "learning_rate": 0.0013444444444444445,
+      "loss": 0.007,
+      "step": 395
+    },
+    {
+      "epoch": 396.0,
+      "grad_norm": 0.06803309917449951,
+      "learning_rate": 0.0013422222222222223,
+      "loss": 0.0079,
+      "step": 396
+    },
+    {
+      "epoch": 397.0,
+      "grad_norm": 0.03568552806973457,
+      "learning_rate": 0.00134,
+      "loss": 0.0068,
+      "step": 397
+    },
+    {
+      "epoch": 398.0,
+      "grad_norm": 0.04094022884964943,
+      "learning_rate": 0.0013377777777777778,
+      "loss": 0.0073,
+      "step": 398
+    },
+    {
+      "epoch": 399.0,
+      "grad_norm": 0.045098766684532166,
+      "learning_rate": 0.0013355555555555555,
+      "loss": 0.0072,
+      "step": 399
+    },
+    {
+      "epoch": 400.0,
+      "grad_norm": 0.03938360884785652,
+      "learning_rate": 0.0013333333333333333,
+      "loss": 0.0073,
+      "step": 400
+    },
+    {
+      "epoch": 401.0,
+      "grad_norm": 0.03678111359477043,
+      "learning_rate": 0.0013311111111111113,
+      "loss": 0.0066,
+      "step": 401
+    },
+    {
+      "epoch": 402.0,
+      "grad_norm": 0.03792262822389603,
+      "learning_rate": 0.0013288888888888888,
+      "loss": 0.0066,
+      "step": 402
+    },
+    {
+      "epoch": 403.0,
+      "grad_norm": 0.04037335887551308,
+      "learning_rate": 0.0013266666666666667,
+      "loss": 0.0069,
+      "step": 403
+    },
+    {
+      "epoch": 404.0,
+      "grad_norm": 0.0448298305273056,
+      "learning_rate": 0.0013244444444444445,
+      "loss": 0.0072,
+      "step": 404
+    },
+    {
+      "epoch": 405.0,
+      "grad_norm": 0.029835334047675133,
+      "learning_rate": 0.0013222222222222222,
+      "loss": 0.0067,
+      "step": 405
+    },
+    {
+      "epoch": 406.0,
+      "grad_norm": 0.03127991408109665,
+      "learning_rate": 0.00132,
+      "loss": 0.0066,
+      "step": 406
+    },
+    {
+      "epoch": 407.0,
+      "grad_norm": 0.034645188599824905,
+      "learning_rate": 0.0013177777777777777,
+      "loss": 0.0065,
+      "step": 407
+    },
+    {
+      "epoch": 408.0,
+      "grad_norm": 0.03312946483492851,
+      "learning_rate": 0.0013155555555555557,
+      "loss": 0.0066,
+      "step": 408
+    },
+    {
+      "epoch": 409.0,
+      "grad_norm": 0.03247128427028656,
+      "learning_rate": 0.0013133333333333332,
+      "loss": 0.0064,
+      "step": 409
+    },
+    {
+      "epoch": 410.0,
+      "grad_norm": 0.03561067953705788,
+      "learning_rate": 0.0013111111111111112,
+      "loss": 0.0064,
+      "step": 410
+    },
+    {
+      "epoch": 411.0,
+      "grad_norm": 0.03821596875786781,
+      "learning_rate": 0.001308888888888889,
+      "loss": 0.0066,
+      "step": 411
+    },
+    {
+      "epoch": 412.0,
+      "grad_norm": 0.0356701985001564,
+      "learning_rate": 0.0013066666666666667,
+      "loss": 0.0065,
+      "step": 412
+    },
+    {
+      "epoch": 413.0,
+      "grad_norm": 0.04700474441051483,
+      "learning_rate": 0.0013044444444444444,
+      "loss": 0.0066,
+      "step": 413
+    },
+    {
+      "epoch": 414.0,
+      "grad_norm": 0.03856738656759262,
+      "learning_rate": 0.0013022222222222222,
+      "loss": 0.0067,
+      "step": 414
+    },
+    {
+      "epoch": 415.0,
+      "grad_norm": 0.03348975256085396,
+      "learning_rate": 0.0013000000000000002,
+      "loss": 0.0065,
+      "step": 415
+    },
+    {
+      "epoch": 416.0,
+      "grad_norm": 0.03193169832229614,
+      "learning_rate": 0.0012977777777777777,
+      "loss": 0.0064,
+      "step": 416
+    },
+    {
+      "epoch": 417.0,
+      "grad_norm": 0.0403468981385231,
+      "learning_rate": 0.0012955555555555557,
+      "loss": 0.0063,
+      "step": 417
+    },
+    {
+      "epoch": 418.0,
+      "grad_norm": 0.03923949599266052,
+      "learning_rate": 0.0012933333333333332,
+      "loss": 0.0068,
+      "step": 418
+    },
+    {
+      "epoch": 419.0,
+      "grad_norm": 0.03874557837843895,
+      "learning_rate": 0.0012911111111111111,
+      "loss": 0.007,
+      "step": 419
+    },
+    {
+      "epoch": 420.0,
+      "grad_norm": 0.03878943994641304,
+      "learning_rate": 0.001288888888888889,
+      "loss": 0.0066,
+      "step": 420
+    },
+    {
+      "epoch": 421.0,
+      "grad_norm": 0.030541859567165375,
+      "learning_rate": 0.0012866666666666666,
+      "loss": 0.0061,
+      "step": 421
+    },
+    {
+      "epoch": 422.0,
+      "grad_norm": 0.05509382113814354,
+      "learning_rate": 0.0012844444444444446,
+      "loss": 0.0068,
+      "step": 422
+    },
+    {
+      "epoch": 423.0,
+      "grad_norm": 0.03676342964172363,
+      "learning_rate": 0.0012822222222222221,
+      "loss": 0.007,
+      "step": 423
+    },
+    {
+      "epoch": 424.0,
+      "grad_norm": 0.03279677405953407,
+      "learning_rate": 0.00128,
+      "loss": 0.0062,
+      "step": 424
+    },
+    {
+      "epoch": 425.0,
+      "grad_norm": 0.03973347321152687,
+      "learning_rate": 0.0012777777777777776,
+      "loss": 0.0066,
+      "step": 425
+    },
+    {
+      "epoch": 426.0,
+      "grad_norm": 0.03546801954507828,
+      "learning_rate": 0.0012755555555555556,
+      "loss": 0.0065,
+      "step": 426
+    },
+    {
+      "epoch": 427.0,
+      "grad_norm": 0.031479015946388245,
+      "learning_rate": 0.0012733333333333333,
+      "loss": 0.0062,
+      "step": 427
+    },
+    {
+      "epoch": 428.0,
+      "grad_norm": 0.033816102892160416,
+      "learning_rate": 0.001271111111111111,
+      "loss": 0.0064,
+      "step": 428
+    },
+    {
+      "epoch": 429.0,
+      "grad_norm": 0.03433229401707649,
+      "learning_rate": 0.001268888888888889,
+      "loss": 0.0067,
+      "step": 429
+    },
+    {
+      "epoch": 430.0,
+      "grad_norm": 0.03628786653280258,
+      "learning_rate": 0.0012666666666666666,
+      "loss": 0.0064,
+      "step": 430
+    },
+    {
+      "epoch": 431.0,
+      "grad_norm": 0.02654869668185711,
+      "learning_rate": 0.0012644444444444446,
+      "loss": 0.006,
+      "step": 431
+    },
+    {
+      "epoch": 432.0,
+      "grad_norm": 0.037869714200496674,
+      "learning_rate": 0.001262222222222222,
+      "loss": 0.0065,
+      "step": 432
+    },
+    {
+      "epoch": 433.0,
+      "grad_norm": 0.04116172716021538,
+      "learning_rate": 0.00126,
+      "loss": 0.0067,
+      "step": 433
+    },
+    {
+      "epoch": 434.0,
+      "grad_norm": 0.036912932991981506,
+      "learning_rate": 0.001257777777777778,
+      "loss": 0.0065,
+      "step": 434
+    },
+    {
+      "epoch": 435.0,
+      "grad_norm": 0.032851189374923706,
+      "learning_rate": 0.0012555555555555555,
+      "loss": 0.0057,
+      "step": 435
+    },
+    {
+      "epoch": 436.0,
+      "grad_norm": 0.03754141926765442,
+      "learning_rate": 0.0012533333333333335,
+      "loss": 0.0064,
+      "step": 436
+    },
+    {
+      "epoch": 437.0,
+      "grad_norm": 0.0499282069504261,
+      "learning_rate": 0.001251111111111111,
+      "loss": 0.0064,
+      "step": 437
+    },
+    {
+      "epoch": 438.0,
+      "grad_norm": 0.03528120741248131,
+      "learning_rate": 0.001248888888888889,
+      "loss": 0.0061,
+      "step": 438
+    },
+    {
+      "epoch": 439.0,
+      "grad_norm": 0.04098303243517876,
+      "learning_rate": 0.0012466666666666665,
+      "loss": 0.0064,
+      "step": 439
+    },
+    {
+      "epoch": 440.0,
+      "grad_norm": 0.05273183062672615,
+      "learning_rate": 0.0012444444444444445,
+      "loss": 0.0059,
+      "step": 440
+    },
+    {
+      "epoch": 441.0,
+      "grad_norm": 0.029608091339468956,
+      "learning_rate": 0.001242222222222222,
+      "loss": 0.0057,
+      "step": 441
+    },
+    {
+      "epoch": 442.0,
+      "grad_norm": 0.03589300811290741,
+      "learning_rate": 0.00124,
+      "loss": 0.0058,
+      "step": 442
+    },
+    {
+      "epoch": 443.0,
+      "grad_norm": 0.03696886822581291,
+      "learning_rate": 0.001237777777777778,
+      "loss": 0.0059,
+      "step": 443
+    },
+    {
+      "epoch": 444.0,
+      "grad_norm": 0.04584373161196709,
+      "learning_rate": 0.0012355555555555555,
+      "loss": 0.0063,
+      "step": 444
+    },
+    {
+      "epoch": 445.0,
+      "grad_norm": 0.03926507383584976,
+      "learning_rate": 0.0012333333333333335,
+      "loss": 0.006,
+      "step": 445
+    },
+    {
+      "epoch": 446.0,
+      "grad_norm": 0.0737149715423584,
+      "learning_rate": 0.001231111111111111,
+      "loss": 0.0065,
+      "step": 446
+    },
+    {
+      "epoch": 447.0,
+      "grad_norm": 0.040384162217378616,
+      "learning_rate": 0.001228888888888889,
+      "loss": 0.006,
+      "step": 447
+    },
+    {
+      "epoch": 448.0,
+      "grad_norm": 0.048789847642183304,
+      "learning_rate": 0.0012266666666666667,
+      "loss": 0.006,
+      "step": 448
+    },
+    {
+      "epoch": 449.0,
+      "grad_norm": 0.04522663727402687,
+      "learning_rate": 0.0012244444444444445,
+      "loss": 0.0064,
+      "step": 449
+    },
+    {
+      "epoch": 450.0,
+      "grad_norm": 0.044181939214468,
+      "learning_rate": 0.0012222222222222224,
+      "loss": 0.0061,
+      "step": 450
+    },
+    {
+      "epoch": 451.0,
+      "grad_norm": 0.04393507167696953,
+      "learning_rate": 0.00122,
+      "loss": 0.0062,
+      "step": 451
+    },
+    {
+      "epoch": 452.0,
+      "grad_norm": 0.0377420112490654,
+      "learning_rate": 0.001217777777777778,
+      "loss": 0.0059,
+      "step": 452
+    },
+    {
+      "epoch": 453.0,
+      "grad_norm": 0.027778957039117813,
+      "learning_rate": 0.0012155555555555554,
+      "loss": 0.0056,
+      "step": 453
+    },
+    {
+      "epoch": 454.0,
+      "grad_norm": 0.03586387261748314,
+      "learning_rate": 0.0012133333333333334,
+      "loss": 0.0059,
+      "step": 454
+    },
+    {
+      "epoch": 455.0,
+      "grad_norm": 0.041379209607839584,
+      "learning_rate": 0.0012111111111111112,
+      "loss": 0.0059,
+      "step": 455
+    },
+    {
+      "epoch": 456.0,
+      "grad_norm": 0.037975508719682693,
+      "learning_rate": 0.001208888888888889,
+      "loss": 0.0061,
+      "step": 456
+    },
+    {
+      "epoch": 457.0,
+      "grad_norm": 0.03542089834809303,
+      "learning_rate": 0.0012066666666666669,
+      "loss": 0.0055,
+      "step": 457
+    },
+    {
+      "epoch": 458.0,
+      "grad_norm": 0.035748131573200226,
+      "learning_rate": 0.0012044444444444444,
+      "loss": 0.006,
+      "step": 458
+    },
+    {
+      "epoch": 459.0,
+      "grad_norm": 0.03112563118338585,
+      "learning_rate": 0.0012022222222222224,
+      "loss": 0.0058,
+      "step": 459
+    },
+    {
+      "epoch": 460.0,
+      "grad_norm": 0.047486674040555954,
+      "learning_rate": 0.0012,
+      "loss": 0.0062,
+      "step": 460
+    },
+    {
+      "epoch": 461.0,
+      "grad_norm": 0.03399772197008133,
+      "learning_rate": 0.0011977777777777779,
+      "loss": 0.0059,
+      "step": 461
+    },
+    {
+      "epoch": 462.0,
+      "grad_norm": 0.043101608753204346,
+      "learning_rate": 0.0011955555555555556,
+      "loss": 0.0059,
+      "step": 462
+    },
+    {
+      "epoch": 463.0,
+      "grad_norm": 0.026961272582411766,
+      "learning_rate": 0.0011933333333333334,
+      "loss": 0.0057,
+      "step": 463
+    },
+    {
+      "epoch": 464.0,
+      "grad_norm": 0.03507644310593605,
+      "learning_rate": 0.001191111111111111,
+      "loss": 0.0059,
+      "step": 464
+    },
+    {
+      "epoch": 465.0,
+      "grad_norm": 0.03533957153558731,
+      "learning_rate": 0.0011888888888888889,
+      "loss": 0.0059,
+      "step": 465
+    },
+    {
+      "epoch": 466.0,
+      "grad_norm": 0.03447294607758522,
+      "learning_rate": 0.0011866666666666668,
+      "loss": 0.0059,
+      "step": 466
+    },
+    {
+      "epoch": 467.0,
+      "grad_norm": 0.03277276083827019,
+      "learning_rate": 0.0011844444444444443,
+      "loss": 0.0061,
+      "step": 467
+    },
+    {
+      "epoch": 468.0,
+      "grad_norm": 0.03529715910553932,
+      "learning_rate": 0.0011822222222222223,
+      "loss": 0.0057,
+      "step": 468
+    },
+    {
+      "epoch": 469.0,
+      "grad_norm": 0.03415314108133316,
+      "learning_rate": 0.00118,
+      "loss": 0.0057,
+      "step": 469
+    },
+    {
+      "epoch": 470.0,
+      "grad_norm": 0.0367075614631176,
+      "learning_rate": 0.0011777777777777778,
+      "loss": 0.0058,
+      "step": 470
+    },
+    {
+      "epoch": 471.0,
+      "grad_norm": 0.04992802441120148,
+      "learning_rate": 0.0011755555555555556,
+      "loss": 0.0058,
+      "step": 471
+    },
+    {
+      "epoch": 472.0,
+      "grad_norm": 0.02544887363910675,
+      "learning_rate": 0.0011733333333333333,
+      "loss": 0.0051,
+      "step": 472
+    },
+    {
+      "epoch": 473.0,
+      "grad_norm": 0.035531774163246155,
+      "learning_rate": 0.0011711111111111113,
+      "loss": 0.0062,
+      "step": 473
+    },
+    {
+      "epoch": 474.0,
+      "grad_norm": 0.02878675051033497,
+      "learning_rate": 0.0011688888888888888,
+      "loss": 0.0052,
+      "step": 474
+    },
+    {
+      "epoch": 475.0,
+      "grad_norm": 0.05629320815205574,
+      "learning_rate": 0.0011666666666666668,
+      "loss": 0.0062,
+      "step": 475
+    },
+    {
+      "epoch": 476.0,
+      "grad_norm": 0.03907129168510437,
+      "learning_rate": 0.0011644444444444445,
+      "loss": 0.0058,
+      "step": 476
+    },
+    {
+      "epoch": 477.0,
+      "grad_norm": 0.05472861975431442,
+      "learning_rate": 0.0011622222222222223,
+      "loss": 0.0056,
+      "step": 477
+    },
+    {
+      "epoch": 478.0,
+      "grad_norm": 0.03535694256424904,
+      "learning_rate": 0.00116,
+      "loss": 0.0054,
+      "step": 478
+    },
+    {
+      "epoch": 479.0,
+      "grad_norm": 0.03546389192342758,
+      "learning_rate": 0.0011577777777777778,
+      "loss": 0.0058,
+      "step": 479
+    },
+    {
+      "epoch": 480.0,
+      "grad_norm": 0.027603119611740112,
+      "learning_rate": 0.0011555555555555555,
+      "loss": 0.0052,
+      "step": 480
+    },
+    {
+      "epoch": 481.0,
+      "grad_norm": 0.03660883754491806,
+      "learning_rate": 0.0011533333333333333,
+      "loss": 0.0057,
+      "step": 481
+    },
+    {
+      "epoch": 482.0,
+      "grad_norm": 0.030513163655996323,
+      "learning_rate": 0.0011511111111111112,
+      "loss": 0.0053,
+      "step": 482
+    },
+    {
+      "epoch": 483.0,
+      "grad_norm": 0.03554394096136093,
+      "learning_rate": 0.001148888888888889,
+      "loss": 0.0057,
+      "step": 483
+    },
+    {
+      "epoch": 484.0,
+      "grad_norm": 0.037891678512096405,
+      "learning_rate": 0.0011466666666666667,
+      "loss": 0.0054,
+      "step": 484
+    },
+    {
+      "epoch": 485.0,
+      "grad_norm": 0.04184143990278244,
+      "learning_rate": 0.0011444444444444445,
+      "loss": 0.0056,
+      "step": 485
+    },
+    {
+      "epoch": 486.0,
+      "grad_norm": 0.03347189724445343,
+      "learning_rate": 0.0011422222222222222,
+      "loss": 0.0054,
+      "step": 486
+    },
+    {
+      "epoch": 487.0,
+      "grad_norm": 0.03891591727733612,
+      "learning_rate": 0.00114,
+      "loss": 0.0053,
+      "step": 487
+    },
+    {
+      "epoch": 488.0,
+      "grad_norm": 0.030163027346134186,
+      "learning_rate": 0.0011377777777777777,
+      "loss": 0.0054,
+      "step": 488
+    },
+    {
+      "epoch": 489.0,
+      "grad_norm": 0.03170597180724144,
+      "learning_rate": 0.0011355555555555557,
+      "loss": 0.0053,
+      "step": 489
+    },
+    {
+      "epoch": 490.0,
+      "grad_norm": 0.027653338387608528,
+      "learning_rate": 0.0011333333333333334,
+      "loss": 0.0049,
+      "step": 490
+    },
+    {
+      "epoch": 491.0,
+      "grad_norm": 0.025576811283826828,
+      "learning_rate": 0.0011311111111111112,
+      "loss": 0.0049,
+      "step": 491
+    },
+    {
+      "epoch": 492.0,
+      "grad_norm": 0.02671181410551071,
+      "learning_rate": 0.001128888888888889,
+      "loss": 0.0051,
+      "step": 492
+    },
+    {
+      "epoch": 493.0,
+      "grad_norm": 0.031090857461094856,
+      "learning_rate": 0.0011266666666666667,
+      "loss": 0.0054,
+      "step": 493
+    },
+    {
+      "epoch": 494.0,
+      "grad_norm": 0.030311353504657745,
+      "learning_rate": 0.0011244444444444444,
+      "loss": 0.0053,
+      "step": 494
+    },
+    {
+      "epoch": 495.0,
+      "grad_norm": 0.03606722131371498,
+      "learning_rate": 0.0011222222222222222,
+      "loss": 0.0053,
+      "step": 495
+    },
+    {
+      "epoch": 496.0,
+      "grad_norm": 0.035778749734163284,
+      "learning_rate": 0.0011200000000000001,
+      "loss": 0.0053,
+      "step": 496
+    },
+    {
+      "epoch": 497.0,
+      "grad_norm": 0.03796238452196121,
+      "learning_rate": 0.0011177777777777779,
+      "loss": 0.0054,
+      "step": 497
+    },
+    {
+      "epoch": 498.0,
+      "grad_norm": 0.02831469103693962,
+      "learning_rate": 0.0011155555555555556,
+      "loss": 0.005,
+      "step": 498
+    },
+    {
+      "epoch": 499.0,
+      "grad_norm": 0.0282357819378376,
+      "learning_rate": 0.0011133333333333334,
+      "loss": 0.005,
+      "step": 499
+    },
+    {
+      "epoch": 500.0,
+      "grad_norm": 0.04182511568069458,
+      "learning_rate": 0.0011111111111111111,
+      "loss": 0.0056,
+      "step": 500
+    },
+    {
+      "epoch": 501.0,
+      "grad_norm": 0.02399536222219467,
+      "learning_rate": 0.0011088888888888889,
+      "loss": 0.0047,
+      "step": 501
+    },
+    {
+      "epoch": 502.0,
+      "grad_norm": 0.033601414412260056,
+      "learning_rate": 0.0011066666666666666,
+      "loss": 0.0051,
+      "step": 502
+    },
+    {
+      "epoch": 503.0,
+      "grad_norm": 0.033893782645463943,
+      "learning_rate": 0.0011044444444444444,
+      "loss": 0.0055,
+      "step": 503
+    },
+    {
+      "epoch": 504.0,
+      "grad_norm": 0.030596988275647163,
+      "learning_rate": 0.0011022222222222223,
+      "loss": 0.0048,
+      "step": 504
+    },
+    {
+      "epoch": 505.0,
+      "grad_norm": 0.03259752318263054,
+      "learning_rate": 0.0011,
+      "loss": 0.0051,
+      "step": 505
+    },
+    {
+      "epoch": 506.0,
+      "grad_norm": 0.02722361497581005,
+      "learning_rate": 0.0010977777777777778,
+      "loss": 0.0045,
+      "step": 506
+    },
+    {
+      "epoch": 507.0,
+      "grad_norm": 0.03016485832631588,
+      "learning_rate": 0.0010955555555555556,
+      "loss": 0.005,
+      "step": 507
+    },
+    {
+      "epoch": 508.0,
+      "grad_norm": 0.02929559536278248,
+      "learning_rate": 0.0010933333333333333,
+      "loss": 0.0049,
+      "step": 508
+    },
+    {
+      "epoch": 509.0,
+      "grad_norm": 0.041284140199422836,
+      "learning_rate": 0.001091111111111111,
+      "loss": 0.005,
+      "step": 509
+    },
+    {
+      "epoch": 510.0,
+      "grad_norm": 0.026360472664237022,
+      "learning_rate": 0.0010888888888888888,
+      "loss": 0.005,
+      "step": 510
+    },
+    {
+      "epoch": 511.0,
+      "grad_norm": 0.03568575158715248,
+      "learning_rate": 0.0010866666666666668,
+      "loss": 0.0048,
+      "step": 511
+    },
+    {
+      "epoch": 512.0,
+      "grad_norm": 0.030765000730752945,
+      "learning_rate": 0.0010844444444444445,
+      "loss": 0.0048,
+      "step": 512
+    },
+    {
+      "epoch": 513.0,
+      "grad_norm": 0.032683007419109344,
+      "learning_rate": 0.0010822222222222223,
+      "loss": 0.0052,
+      "step": 513
+    },
+    {
+      "epoch": 514.0,
+      "grad_norm": 0.025469932705163956,
+      "learning_rate": 0.00108,
+      "loss": 0.0046,
+      "step": 514
+    },
+    {
+      "epoch": 515.0,
+      "grad_norm": 0.0416124053299427,
+      "learning_rate": 0.0010777777777777778,
+      "loss": 0.0051,
+      "step": 515
+    },
+    {
+      "epoch": 516.0,
+      "grad_norm": 0.03848906606435776,
+      "learning_rate": 0.0010755555555555557,
+      "loss": 0.0052,
+      "step": 516
+    },
+    {
+      "epoch": 517.0,
+      "grad_norm": 0.04426151141524315,
+      "learning_rate": 0.0010733333333333333,
+      "loss": 0.0052,
+      "step": 517
+    },
+    {
+      "epoch": 518.0,
+      "grad_norm": 0.03802550211548805,
+      "learning_rate": 0.0010711111111111112,
+      "loss": 0.0049,
+      "step": 518
+    },
+    {
+      "epoch": 519.0,
+      "grad_norm": 0.025775237008929253,
+      "learning_rate": 0.001068888888888889,
+      "loss": 0.0049,
+      "step": 519
+    },
+    {
+      "epoch": 520.0,
+      "grad_norm": 0.04428073391318321,
+      "learning_rate": 0.0010666666666666667,
+      "loss": 0.0052,
+      "step": 520
+    },
+    {
+      "epoch": 521.0,
+      "grad_norm": 0.033617082983255386,
+      "learning_rate": 0.0010644444444444445,
+      "loss": 0.0051,
+      "step": 521
+    },
+    {
+      "epoch": 522.0,
+      "grad_norm": 0.033705078065395355,
+      "learning_rate": 0.0010622222222222222,
+      "loss": 0.0048,
+      "step": 522
+    },
+    {
+      "epoch": 523.0,
+      "grad_norm": 0.04792787879705429,
+      "learning_rate": 0.0010600000000000002,
+      "loss": 0.0049,
+      "step": 523
+    },
+    {
+      "epoch": 524.0,
+      "grad_norm": 0.02900075912475586,
+      "learning_rate": 0.0010577777777777777,
+      "loss": 0.0047,
+      "step": 524
+    },
+    {
+      "epoch": 525.0,
+      "grad_norm": 0.054417647421360016,
+      "learning_rate": 0.0010555555555555557,
+      "loss": 0.0054,
+      "step": 525
+    },
+    {
+      "epoch": 526.0,
+      "grad_norm": 0.03227232024073601,
+      "learning_rate": 0.0010533333333333332,
+      "loss": 0.005,
+      "step": 526
+    },
+    {
+      "epoch": 527.0,
+      "grad_norm": 0.03752639517188072,
+      "learning_rate": 0.0010511111111111112,
+      "loss": 0.0053,
+      "step": 527
+    },
+    {
+      "epoch": 528.0,
+      "grad_norm": 0.029628194868564606,
+      "learning_rate": 0.001048888888888889,
+      "loss": 0.0049,
+      "step": 528
+    },
+    {
+      "epoch": 529.0,
+      "grad_norm": 0.03387615829706192,
+      "learning_rate": 0.0010466666666666667,
+      "loss": 0.005,
+      "step": 529
+    },
+    {
+      "epoch": 530.0,
+      "grad_norm": 0.033868737518787384,
+      "learning_rate": 0.0010444444444444446,
+      "loss": 0.0048,
+      "step": 530
+    },
+    {
+      "epoch": 531.0,
+      "grad_norm": 0.035898059606552124,
+      "learning_rate": 0.0010422222222222222,
+      "loss": 0.005,
+      "step": 531
+    },
+    {
+      "epoch": 532.0,
+      "grad_norm": 0.040057726204395294,
+      "learning_rate": 0.0010400000000000001,
+      "loss": 0.005,
+      "step": 532
+    },
+    {
+      "epoch": 533.0,
+      "grad_norm": 0.03613459691405296,
+      "learning_rate": 0.0010377777777777777,
+      "loss": 0.005,
+      "step": 533
+    },
+    {
+      "epoch": 534.0,
+      "grad_norm": 0.034286290407180786,
+      "learning_rate": 0.0010355555555555556,
+      "loss": 0.005,
+      "step": 534
+    },
+    {
+      "epoch": 535.0,
+      "grad_norm": 0.040484048426151276,
+      "learning_rate": 0.0010333333333333334,
+      "loss": 0.005,
+      "step": 535
+    },
+    {
+      "epoch": 536.0,
+      "grad_norm": 0.03549760952591896,
+      "learning_rate": 0.0010311111111111111,
+      "loss": 0.0049,
+      "step": 536
+    },
+    {
+      "epoch": 537.0,
+      "grad_norm": 0.03199818730354309,
+      "learning_rate": 0.001028888888888889,
+      "loss": 0.0048,
+      "step": 537
+    },
+    {
+      "epoch": 538.0,
+      "grad_norm": 0.045031916350126266,
+      "learning_rate": 0.0010266666666666666,
+      "loss": 0.005,
+      "step": 538
+    },
+    {
+      "epoch": 539.0,
+      "grad_norm": 0.03412579745054245,
+      "learning_rate": 0.0010244444444444446,
+      "loss": 0.0051,
+      "step": 539
+    },
+    {
+      "epoch": 540.0,
+      "grad_norm": 0.02991371974349022,
+      "learning_rate": 0.0010222222222222221,
+      "loss": 0.0046,
+      "step": 540
+    },
+    {
+      "epoch": 541.0,
+      "grad_norm": 0.025920365005731583,
+      "learning_rate": 0.00102,
+      "loss": 0.0044,
+      "step": 541
+    },
+    {
+      "epoch": 542.0,
+      "grad_norm": 0.04434429481625557,
+      "learning_rate": 0.0010177777777777776,
+      "loss": 0.005,
+      "step": 542
+    },
+    {
+      "epoch": 543.0,
+      "grad_norm": 0.03925777226686478,
+      "learning_rate": 0.0010155555555555556,
+      "loss": 0.005,
+      "step": 543
+    },
+    {
+      "epoch": 544.0,
+      "grad_norm": 0.028992939740419388,
+      "learning_rate": 0.0010133333333333335,
+      "loss": 0.0044,
+      "step": 544
+    },
+    {
+      "epoch": 545.0,
+      "grad_norm": 0.050765953958034515,
+      "learning_rate": 0.001011111111111111,
+      "loss": 0.005,
+      "step": 545
+    },
+    {
+      "epoch": 546.0,
+      "grad_norm": 0.0336458683013916,
+      "learning_rate": 0.001008888888888889,
+      "loss": 0.0047,
+      "step": 546
+    },
+    {
+      "epoch": 547.0,
+      "grad_norm": 0.03314169868826866,
+      "learning_rate": 0.0010066666666666666,
+      "loss": 0.0047,
+      "step": 547
+    },
+    {
+      "epoch": 548.0,
+      "grad_norm": 0.048472099006175995,
+      "learning_rate": 0.0010044444444444445,
+      "loss": 0.0049,
+      "step": 548
+    },
+    {
+      "epoch": 549.0,
+      "grad_norm": 0.03656391799449921,
+      "learning_rate": 0.001002222222222222,
+      "loss": 0.0049,
+      "step": 549
+    },
+    {
+      "epoch": 550.0,
+      "grad_norm": 0.046296313405036926,
+      "learning_rate": 0.001,
+      "loss": 0.0049,
+      "step": 550
+    },
+    {
+      "epoch": 551.0,
+      "grad_norm": 0.028759324923157692,
+      "learning_rate": 0.0009977777777777778,
+      "loss": 0.0047,
+      "step": 551
+    },
+    {
+      "epoch": 552.0,
+      "grad_norm": 0.031202584505081177,
+      "learning_rate": 0.0009955555555555555,
+      "loss": 0.0048,
+      "step": 552
+    },
+    {
+      "epoch": 553.0,
+      "grad_norm": 0.040073346346616745,
+      "learning_rate": 0.0009933333333333333,
+      "loss": 0.005,
+      "step": 553
+    },
+    {
+      "epoch": 554.0,
+      "grad_norm": 0.05222450569272041,
+      "learning_rate": 0.0009911111111111112,
+      "loss": 0.0047,
+      "step": 554
+    },
+    {
+      "epoch": 555.0,
+      "grad_norm": 0.03518804907798767,
+      "learning_rate": 0.000988888888888889,
+      "loss": 0.0046,
+      "step": 555
+    },
+    {
+      "epoch": 556.0,
+      "grad_norm": 0.04386880621314049,
+      "learning_rate": 0.0009866666666666667,
+      "loss": 0.0049,
+      "step": 556
+    },
+    {
+      "epoch": 557.0,
+      "grad_norm": 0.034030430018901825,
+      "learning_rate": 0.0009844444444444445,
+      "loss": 0.0049,
+      "step": 557
+    },
+    {
+      "epoch": 558.0,
+      "grad_norm": 0.024780094623565674,
+      "learning_rate": 0.0009822222222222222,
+      "loss": 0.0046,
+      "step": 558
+    },
+    {
+      "epoch": 559.0,
+      "grad_norm": 0.027013640850782394,
+      "learning_rate": 0.00098,
+      "loss": 0.0046,
+      "step": 559
+    },
+    {
+      "epoch": 560.0,
+      "grad_norm": 0.030160361900925636,
+      "learning_rate": 0.0009777777777777777,
+      "loss": 0.0043,
+      "step": 560
+    },
+    {
+      "epoch": 561.0,
+      "grad_norm": 0.034773167222738266,
+      "learning_rate": 0.0009755555555555556,
+      "loss": 0.005,
+      "step": 561
+    },
+    {
+      "epoch": 562.0,
+      "grad_norm": 0.031707510352134705,
+      "learning_rate": 0.0009733333333333334,
+      "loss": 0.0046,
+      "step": 562
+    },
+    {
+      "epoch": 563.0,
+      "grad_norm": 0.028600016608834267,
+      "learning_rate": 0.0009711111111111112,
+      "loss": 0.0044,
+      "step": 563
+    },
+    {
+      "epoch": 564.0,
+      "grad_norm": 0.04488474503159523,
+      "learning_rate": 0.0009688888888888889,
+      "loss": 0.0046,
+      "step": 564
+    },
+    {
+      "epoch": 565.0,
+      "grad_norm": 0.027938274666666985,
+      "learning_rate": 0.0009666666666666667,
+      "loss": 0.0045,
+      "step": 565
+    },
+    {
+      "epoch": 566.0,
+      "grad_norm": 0.036352016031742096,
+      "learning_rate": 0.0009644444444444444,
+      "loss": 0.0045,
+      "step": 566
+    },
+    {
+      "epoch": 567.0,
+      "grad_norm": 0.03465161472558975,
+      "learning_rate": 0.0009622222222222222,
+      "loss": 0.0045,
+      "step": 567
+    },
+    {
+      "epoch": 568.0,
+      "grad_norm": 0.0352412685751915,
+      "learning_rate": 0.00096,
+      "loss": 0.0047,
+      "step": 568
+    },
+    {
+      "epoch": 569.0,
+      "grad_norm": 0.03907673805952072,
+      "learning_rate": 0.0009577777777777778,
+      "loss": 0.0049,
+      "step": 569
+    },
+    {
+      "epoch": 570.0,
+      "grad_norm": 0.0324881337583065,
+      "learning_rate": 0.0009555555555555556,
+      "loss": 0.0045,
+      "step": 570
+    },
+    {
+      "epoch": 571.0,
+      "grad_norm": 0.03322600945830345,
+      "learning_rate": 0.0009533333333333334,
+      "loss": 0.0046,
+      "step": 571
+    },
+    {
+      "epoch": 572.0,
+      "grad_norm": 0.04286476597189903,
+      "learning_rate": 0.0009511111111111111,
+      "loss": 0.0045,
+      "step": 572
+    },
+    {
+      "epoch": 573.0,
+      "grad_norm": 0.038839150220155716,
+      "learning_rate": 0.0009488888888888889,
+      "loss": 0.0046,
+      "step": 573
+    },
+    {
+      "epoch": 574.0,
+      "grad_norm": 0.027261720970273018,
+      "learning_rate": 0.0009466666666666667,
+      "loss": 0.0043,
+      "step": 574
+    },
+    {
+      "epoch": 575.0,
+      "grad_norm": 0.0263065192848444,
+      "learning_rate": 0.0009444444444444445,
+      "loss": 0.0041,
+      "step": 575
+    },
+    {
+      "epoch": 576.0,
+      "grad_norm": 0.03773610666394234,
+      "learning_rate": 0.0009422222222222222,
+      "loss": 0.0048,
+      "step": 576
+    },
+    {
+      "epoch": 577.0,
+      "grad_norm": 0.03849232941865921,
+      "learning_rate": 0.00094,
+      "loss": 0.005,
+      "step": 577
+    },
+    {
+      "epoch": 578.0,
+      "grad_norm": 0.04704838618636131,
+      "learning_rate": 0.0009377777777777778,
+      "loss": 0.0049,
+      "step": 578
+    },
+    {
+      "epoch": 579.0,
+      "grad_norm": 0.029075607657432556,
+      "learning_rate": 0.0009355555555555556,
+      "loss": 0.0044,
+      "step": 579
+    },
+    {
+      "epoch": 580.0,
+      "grad_norm": 0.026013720780611038,
+      "learning_rate": 0.0009333333333333333,
+      "loss": 0.0043,
+      "step": 580
+    },
+    {
+      "epoch": 581.0,
+      "grad_norm": 0.03620687872171402,
+      "learning_rate": 0.0009311111111111112,
+      "loss": 0.0046,
+      "step": 581
+    },
+    {
+      "epoch": 582.0,
+      "grad_norm": 0.03129187971353531,
+      "learning_rate": 0.0009288888888888889,
+      "loss": 0.0044,
+      "step": 582
+    },
+    {
+      "epoch": 583.0,
+      "grad_norm": 0.022100580856204033,
+      "learning_rate": 0.0009266666666666667,
+      "loss": 0.004,
+      "step": 583
+    },
+    {
+      "epoch": 584.0,
+      "grad_norm": 0.03407726436853409,
+      "learning_rate": 0.0009244444444444444,
+      "loss": 0.0045,
+      "step": 584
+    },
+    {
+      "epoch": 585.0,
+      "grad_norm": 0.030179621651768684,
+      "learning_rate": 0.0009222222222222223,
+      "loss": 0.0045,
+      "step": 585
+    },
+    {
+      "epoch": 586.0,
+      "grad_norm": 0.03250374644994736,
+      "learning_rate": 0.00092,
+      "loss": 0.0048,
+      "step": 586
+    },
+    {
+      "epoch": 587.0,
+      "grad_norm": 0.030906520783901215,
+      "learning_rate": 0.0009177777777777778,
+      "loss": 0.0043,
+      "step": 587
+    },
+    {
+      "epoch": 588.0,
+      "grad_norm": 0.03283608704805374,
+      "learning_rate": 0.0009155555555555556,
+      "loss": 0.0044,
+      "step": 588
+    },
+    {
+      "epoch": 589.0,
+      "grad_norm": 0.029278622940182686,
+      "learning_rate": 0.0009133333333333334,
+      "loss": 0.0044,
+      "step": 589
+    },
+    {
+      "epoch": 590.0,
+      "grad_norm": 0.02297147922217846,
+      "learning_rate": 0.0009111111111111111,
+      "loss": 0.0041,
+      "step": 590
+    },
+    {
+      "epoch": 591.0,
+      "grad_norm": 0.023177258670330048,
+      "learning_rate": 0.0009088888888888889,
+      "loss": 0.0039,
+      "step": 591
+    },
+    {
+      "epoch": 592.0,
+      "grad_norm": 0.030108338221907616,
+      "learning_rate": 0.0009066666666666666,
+      "loss": 0.0043,
+      "step": 592
+    },
+    {
+      "epoch": 593.0,
+      "grad_norm": 0.02184971235692501,
+      "learning_rate": 0.0009044444444444445,
+      "loss": 0.0038,
+      "step": 593
+    },
+    {
+      "epoch": 594.0,
+      "grad_norm": 0.027614466845989227,
+      "learning_rate": 0.0009022222222222222,
+      "loss": 0.0041,
+      "step": 594
+    },
+    {
+      "epoch": 595.0,
+      "grad_norm": 0.038632676005363464,
+      "learning_rate": 0.0009000000000000001,
+      "loss": 0.0044,
+      "step": 595
+    },
+    {
+      "epoch": 596.0,
+      "grad_norm": 0.032530948519706726,
+      "learning_rate": 0.0008977777777777778,
+      "loss": 0.0043,
+      "step": 596
+    },
+    {
+      "epoch": 597.0,
+      "grad_norm": 0.028832774609327316,
+      "learning_rate": 0.0008955555555555556,
+      "loss": 0.0044,
+      "step": 597
+    },
+    {
+      "epoch": 598.0,
+      "grad_norm": 0.02461000345647335,
+      "learning_rate": 0.0008933333333333333,
+      "loss": 0.0042,
+      "step": 598
+    },
+    {
+      "epoch": 599.0,
+      "grad_norm": 0.028294360265135765,
+      "learning_rate": 0.0008911111111111111,
+      "loss": 0.0043,
+      "step": 599
+    },
+    {
+      "epoch": 600.0,
+      "grad_norm": 0.023488713428378105,
+      "learning_rate": 0.0008888888888888888,
+      "loss": 0.004,
+      "step": 600
+    },
+    {
+      "epoch": 601.0,
+      "grad_norm": 0.03520805388689041,
+      "learning_rate": 0.0008866666666666667,
+      "loss": 0.0038,
+      "step": 601
+    },
+    {
+      "epoch": 602.0,
+      "grad_norm": 0.03618593141436577,
+      "learning_rate": 0.0008844444444444445,
+      "loss": 0.0045,
+      "step": 602
+    },
+    {
+      "epoch": 603.0,
+      "grad_norm": 0.02912176214158535,
+      "learning_rate": 0.0008822222222222223,
+      "loss": 0.0042,
+      "step": 603
+    },
+    {
+      "epoch": 604.0,
+      "grad_norm": 0.02343122847378254,
+      "learning_rate": 0.00088,
+      "loss": 0.0038,
+      "step": 604
+    },
+    {
+      "epoch": 605.0,
+      "grad_norm": 0.032638318836688995,
+      "learning_rate": 0.0008777777777777778,
+      "loss": 0.0041,
+      "step": 605
+    },
+    {
+      "epoch": 606.0,
+      "grad_norm": 0.03873821347951889,
+      "learning_rate": 0.0008755555555555555,
+      "loss": 0.0045,
+      "step": 606
+    },
+    {
+      "epoch": 607.0,
+      "grad_norm": 0.04660570248961449,
+      "learning_rate": 0.0008733333333333333,
+      "loss": 0.0044,
+      "step": 607
+    },
+    {
+      "epoch": 608.0,
+      "grad_norm": 0.03421563282608986,
+      "learning_rate": 0.000871111111111111,
+      "loss": 0.004,
+      "step": 608
+    },
+    {
+      "epoch": 609.0,
+      "grad_norm": 0.02049202285706997,
+      "learning_rate": 0.000868888888888889,
+      "loss": 0.0038,
+      "step": 609
+    },
+    {
+      "epoch": 610.0,
+      "grad_norm": 0.0303183626383543,
+      "learning_rate": 0.0008666666666666667,
+      "loss": 0.004,
+      "step": 610
+    },
+    {
+      "epoch": 611.0,
+      "grad_norm": 0.03326858580112457,
+      "learning_rate": 0.0008644444444444445,
+      "loss": 0.004,
+      "step": 611
+    },
+    {
+      "epoch": 612.0,
+      "grad_norm": 0.030689280480146408,
+      "learning_rate": 0.0008622222222222222,
+      "loss": 0.0041,
+      "step": 612
+    },
+    {
+      "epoch": 613.0,
+      "grad_norm": 0.029986605048179626,
+      "learning_rate": 0.00086,
+      "loss": 0.0041,
+      "step": 613
+    },
+    {
+      "epoch": 614.0,
+      "grad_norm": 0.02606850303709507,
+      "learning_rate": 0.0008577777777777777,
+      "loss": 0.004,
+      "step": 614
+    },
+    {
+      "epoch": 615.0,
+      "grad_norm": 0.034229837357997894,
+      "learning_rate": 0.0008555555555555556,
+      "loss": 0.004,
+      "step": 615
+    },
+    {
+      "epoch": 616.0,
+      "grad_norm": 0.029757648706436157,
+      "learning_rate": 0.0008533333333333334,
+      "loss": 0.0043,
+      "step": 616
+    },
+    {
+      "epoch": 617.0,
+      "grad_norm": 0.03645173832774162,
+      "learning_rate": 0.0008511111111111112,
+      "loss": 0.0044,
+      "step": 617
+    },
+    {
+      "epoch": 618.0,
+      "grad_norm": 0.03808034211397171,
+      "learning_rate": 0.0008488888888888889,
+      "loss": 0.0041,
+      "step": 618
+    },
+    {
+      "epoch": 619.0,
+      "grad_norm": 0.02224326692521572,
+      "learning_rate": 0.0008466666666666667,
+      "loss": 0.0038,
+      "step": 619
+    },
+    {
+      "epoch": 620.0,
+      "grad_norm": 0.03940601274371147,
+      "learning_rate": 0.0008444444444444444,
+      "loss": 0.0039,
+      "step": 620
+    },
+    {
+      "epoch": 621.0,
+      "grad_norm": 0.049832046031951904,
+      "learning_rate": 0.0008422222222222222,
+      "loss": 0.0044,
+      "step": 621
+    },
+    {
+      "epoch": 622.0,
+      "grad_norm": 0.022758588194847107,
+      "learning_rate": 0.00084,
+      "loss": 0.0039,
+      "step": 622
+    },
+    {
+      "epoch": 623.0,
+      "grad_norm": 0.049774039536714554,
+      "learning_rate": 0.0008377777777777778,
+      "loss": 0.0045,
+      "step": 623
+    },
+    {
+      "epoch": 624.0,
+      "grad_norm": 0.03903564065694809,
+      "learning_rate": 0.0008355555555555556,
+      "loss": 0.0042,
+      "step": 624
+    },
+    {
+      "epoch": 625.0,
+      "grad_norm": 0.029349060729146004,
+      "learning_rate": 0.0008333333333333334,
+      "loss": 0.0042,
+      "step": 625
+    },
+    {
+      "epoch": 626.0,
+      "grad_norm": 0.034631963819265366,
+      "learning_rate": 0.0008311111111111111,
+      "loss": 0.0043,
+      "step": 626
+    },
+    {
+      "epoch": 627.0,
+      "grad_norm": 0.04115751013159752,
+      "learning_rate": 0.0008288888888888889,
+      "loss": 0.0044,
+      "step": 627
+    },
+    {
+      "epoch": 628.0,
+      "grad_norm": 0.03835492208600044,
+      "learning_rate": 0.0008266666666666666,
+      "loss": 0.0038,
+      "step": 628
+    },
+    {
+      "epoch": 629.0,
+      "grad_norm": 0.027649203315377235,
+      "learning_rate": 0.0008244444444444445,
+      "loss": 0.0042,
+      "step": 629
+    },
+    {
+      "epoch": 630.0,
+      "grad_norm": 0.028530064970254898,
+      "learning_rate": 0.0008222222222222222,
+      "loss": 0.0041,
+      "step": 630
+    },
+    {
+      "epoch": 631.0,
+      "grad_norm": 0.029089566320180893,
+      "learning_rate": 0.00082,
+      "loss": 0.0038,
+      "step": 631
+    },
+    {
+      "epoch": 632.0,
+      "grad_norm": 0.04255674034357071,
+      "learning_rate": 0.0008177777777777778,
+      "loss": 0.0042,
+      "step": 632
+    },
+    {
+      "epoch": 633.0,
+      "grad_norm": 0.024558911100029945,
+      "learning_rate": 0.0008155555555555556,
+      "loss": 0.0039,
+      "step": 633
+    },
+    {
+      "epoch": 634.0,
+      "grad_norm": 0.031170202419161797,
+      "learning_rate": 0.0008133333333333333,
+      "loss": 0.0041,
+      "step": 634
+    },
+    {
+      "epoch": 635.0,
+      "grad_norm": 0.029109520837664604,
+      "learning_rate": 0.0008111111111111111,
+      "loss": 0.0038,
+      "step": 635
+    },
+    {
+      "epoch": 636.0,
+      "grad_norm": 0.02705140970647335,
+      "learning_rate": 0.0008088888888888889,
+      "loss": 0.0038,
+      "step": 636
+    },
+    {
+      "epoch": 637.0,
+      "grad_norm": 0.023048389703035355,
+      "learning_rate": 0.0008066666666666667,
+      "loss": 0.0038,
+      "step": 637
+    },
+    {
+      "epoch": 638.0,
+      "grad_norm": 0.02259679324924946,
+      "learning_rate": 0.0008044444444444444,
+      "loss": 0.0036,
+      "step": 638
+    },
+    {
+      "epoch": 639.0,
+      "grad_norm": 0.029809147119522095,
+      "learning_rate": 0.0008022222222222222,
+      "loss": 0.0039,
+      "step": 639
+    },
+    {
+      "epoch": 640.0,
+      "grad_norm": 0.028109390288591385,
+      "learning_rate": 0.0008,
+      "loss": 0.0038,
+      "step": 640
+    },
+    {
+      "epoch": 641.0,
+      "grad_norm": 0.04414813220500946,
+      "learning_rate": 0.0007977777777777778,
+      "loss": 0.0042,
+      "step": 641
+    },
+    {
+      "epoch": 642.0,
+      "grad_norm": 0.030944792553782463,
+      "learning_rate": 0.0007955555555555555,
+      "loss": 0.0038,
+      "step": 642
+    },
+    {
+      "epoch": 643.0,
+      "grad_norm": 0.023884311318397522,
+      "learning_rate": 0.0007933333333333334,
+      "loss": 0.0036,
+      "step": 643
+    },
+    {
+      "epoch": 644.0,
+      "grad_norm": 0.024393659085035324,
+      "learning_rate": 0.0007911111111111111,
+      "loss": 0.0038,
+      "step": 644
+    },
+    {
+      "epoch": 645.0,
+      "grad_norm": 0.02105238474905491,
+      "learning_rate": 0.0007888888888888889,
+      "loss": 0.0036,
+      "step": 645
+    },
+    {
+      "epoch": 646.0,
+      "grad_norm": 0.02686360850930214,
+      "learning_rate": 0.0007866666666666666,
+      "loss": 0.0039,
+      "step": 646
+    },
+    {
+      "epoch": 647.0,
+      "grad_norm": 0.03692441061139107,
+      "learning_rate": 0.0007844444444444445,
+      "loss": 0.0042,
+      "step": 647
+    },
+    {
+      "epoch": 648.0,
+      "grad_norm": 0.026083093136548996,
+      "learning_rate": 0.0007822222222222222,
+      "loss": 0.0037,
+      "step": 648
+    },
+    {
+      "epoch": 649.0,
+      "grad_norm": 0.021342594176530838,
+      "learning_rate": 0.0007800000000000001,
+      "loss": 0.0036,
+      "step": 649
+    },
+    {
+      "epoch": 650.0,
+      "grad_norm": 0.024524180218577385,
+      "learning_rate": 0.0007777777777777778,
+      "loss": 0.0039,
+      "step": 650
+    },
+    {
+      "epoch": 651.0,
+      "grad_norm": 0.028572745621204376,
+      "learning_rate": 0.0007755555555555556,
+      "loss": 0.0038,
+      "step": 651
+    },
+    {
+      "epoch": 652.0,
+      "grad_norm": 0.043239813297986984,
+      "learning_rate": 0.0007733333333333333,
+      "loss": 0.0041,
+      "step": 652
+    },
+    {
+      "epoch": 653.0,
+      "grad_norm": 0.021039173007011414,
+      "learning_rate": 0.0007711111111111111,
+      "loss": 0.0036,
+      "step": 653
+    },
+    {
+      "epoch": 654.0,
+      "grad_norm": 0.0256752148270607,
+      "learning_rate": 0.0007688888888888888,
+      "loss": 0.0037,
+      "step": 654
+    },
+    {
+      "epoch": 655.0,
+      "grad_norm": 0.033442508429288864,
+      "learning_rate": 0.0007666666666666667,
+      "loss": 0.0041,
+      "step": 655
+    },
+    {
+      "epoch": 656.0,
+      "grad_norm": 0.023301295936107635,
+      "learning_rate": 0.0007644444444444445,
+      "loss": 0.0038,
+      "step": 656
+    },
+    {
+      "epoch": 657.0,
+      "grad_norm": 0.02558085508644581,
+      "learning_rate": 0.0007622222222222223,
+      "loss": 0.0039,
+      "step": 657
+    },
+    {
+      "epoch": 658.0,
+      "grad_norm": 0.02924611233174801,
+      "learning_rate": 0.00076,
+      "loss": 0.0038,
+      "step": 658
+    },
+    {
+      "epoch": 659.0,
+      "grad_norm": 0.02991502359509468,
+      "learning_rate": 0.0007577777777777778,
+      "loss": 0.0035,
+      "step": 659
+    },
+    {
+      "epoch": 660.0,
+      "grad_norm": 0.03482845798134804,
+      "learning_rate": 0.0007555555555555555,
+      "loss": 0.004,
+      "step": 660
+    },
+    {
+      "epoch": 661.0,
+      "grad_norm": 0.02580106444656849,
+      "learning_rate": 0.0007533333333333333,
+      "loss": 0.0036,
+      "step": 661
+    },
+    {
+      "epoch": 662.0,
+      "grad_norm": 0.020824357867240906,
+      "learning_rate": 0.000751111111111111,
+      "loss": 0.0033,
+      "step": 662
+    },
+    {
+      "epoch": 663.0,
+      "grad_norm": 0.02536211535334587,
+      "learning_rate": 0.000748888888888889,
+      "loss": 0.0037,
+      "step": 663
+    },
+    {
+      "epoch": 664.0,
+      "grad_norm": 0.05054183304309845,
+      "learning_rate": 0.0007466666666666667,
+      "loss": 0.0045,
+      "step": 664
+    },
+    {
+      "epoch": 665.0,
+      "grad_norm": 0.022909749299287796,
+      "learning_rate": 0.0007444444444444445,
+      "loss": 0.0036,
+      "step": 665
+    },
+    {
+      "epoch": 666.0,
+      "grad_norm": 0.027930857613682747,
+      "learning_rate": 0.0007422222222222222,
+      "loss": 0.0038,
+      "step": 666
+    },
+    {
+      "epoch": 667.0,
+      "grad_norm": 0.022380666807293892,
+      "learning_rate": 0.00074,
+      "loss": 0.0036,
+      "step": 667
+    },
+    {
+      "epoch": 668.0,
+      "grad_norm": 0.032786887139081955,
+      "learning_rate": 0.0007377777777777777,
+      "loss": 0.0036,
+      "step": 668
+    },
+    {
+      "epoch": 669.0,
+      "grad_norm": 0.03477782383561134,
+      "learning_rate": 0.0007355555555555555,
+      "loss": 0.0041,
+      "step": 669
+    },
+    {
+      "epoch": 670.0,
+      "grad_norm": 0.029481414705514908,
+      "learning_rate": 0.0007333333333333333,
+      "loss": 0.0038,
+      "step": 670
+    },
+    {
+      "epoch": 671.0,
+      "grad_norm": 0.039676692336797714,
+      "learning_rate": 0.0007311111111111112,
+      "loss": 0.0039,
+      "step": 671
+    },
+    {
+      "epoch": 672.0,
+      "grad_norm": 0.049726180732250214,
+      "learning_rate": 0.0007288888888888889,
+      "loss": 0.0042,
+      "step": 672
+    },
+    {
+      "epoch": 673.0,
+      "grad_norm": 0.02737841010093689,
+      "learning_rate": 0.0007266666666666667,
+      "loss": 0.0036,
+      "step": 673
+    },
+    {
+      "epoch": 674.0,
+      "grad_norm": 0.02903469279408455,
+      "learning_rate": 0.0007244444444444444,
+      "loss": 0.0037,
+      "step": 674
+    },
+    {
+      "epoch": 675.0,
+      "grad_norm": 0.036345984786748886,
+      "learning_rate": 0.0007222222222222222,
+      "loss": 0.004,
+      "step": 675
+    },
+    {
+      "epoch": 676.0,
+      "grad_norm": 0.05052892118692398,
+      "learning_rate": 0.0007199999999999999,
+      "loss": 0.0041,
+      "step": 676
+    },
+    {
+      "epoch": 677.0,
+      "grad_norm": 0.024914775043725967,
+      "learning_rate": 0.0007177777777777778,
+      "loss": 0.0036,
+      "step": 677
+    },
+    {
+      "epoch": 678.0,
+      "grad_norm": 0.053874969482421875,
+      "learning_rate": 0.0007155555555555555,
+      "loss": 0.0039,
+      "step": 678
+    },
+    {
+      "epoch": 679.0,
+      "grad_norm": 0.029548736289143562,
+      "learning_rate": 0.0007133333333333334,
+      "loss": 0.0035,
+      "step": 679
+    },
+    {
+      "epoch": 680.0,
+      "grad_norm": 0.030973508954048157,
+      "learning_rate": 0.0007111111111111111,
+      "loss": 0.0038,
+      "step": 680
+    },
+    {
+      "epoch": 681.0,
+      "grad_norm": 0.021659094840288162,
+      "learning_rate": 0.0007088888888888889,
+      "loss": 0.0035,
+      "step": 681
+    },
+    {
+      "epoch": 682.0,
+      "grad_norm": 0.05033154413104057,
+      "learning_rate": 0.0007066666666666666,
+      "loss": 0.0042,
+      "step": 682
+    },
+    {
+      "epoch": 683.0,
+      "grad_norm": 0.024461543187499046,
+      "learning_rate": 0.0007044444444444445,
+      "loss": 0.0035,
+      "step": 683
+    },
+    {
+      "epoch": 684.0,
+      "grad_norm": 0.03515414148569107,
+      "learning_rate": 0.0007022222222222222,
+      "loss": 0.0036,
+      "step": 684
+    },
+    {
+      "epoch": 685.0,
+      "grad_norm": 0.024221835657954216,
+      "learning_rate": 0.0007,
+      "loss": 0.0035,
+      "step": 685
+    },
+    {
+      "epoch": 686.0,
+      "grad_norm": 0.0397065207362175,
+      "learning_rate": 0.0006977777777777778,
+      "loss": 0.0035,
+      "step": 686
+    },
+    {
+      "epoch": 687.0,
+      "grad_norm": 0.06702705472707748,
+      "learning_rate": 0.0006955555555555556,
+      "loss": 0.0042,
+      "step": 687
+    },
+    {
+      "epoch": 688.0,
+      "grad_norm": 0.0224533099681139,
+      "learning_rate": 0.0006933333333333333,
+      "loss": 0.0034,
+      "step": 688
+    },
+    {
+      "epoch": 689.0,
+      "grad_norm": 0.029705343768000603,
+      "learning_rate": 0.0006911111111111111,
+      "loss": 0.0035,
+      "step": 689
+    },
+    {
+      "epoch": 690.0,
+      "grad_norm": 0.05282840505242348,
+      "learning_rate": 0.000688888888888889,
+      "loss": 0.0043,
+      "step": 690
+    },
+    {
+      "epoch": 691.0,
+      "grad_norm": 0.04364459589123726,
+      "learning_rate": 0.0006866666666666667,
+      "loss": 0.0042,
+      "step": 691
+    },
+    {
+      "epoch": 692.0,
+      "grad_norm": 0.025150645524263382,
+      "learning_rate": 0.0006844444444444444,
+      "loss": 0.0034,
+      "step": 692
+    },
+    {
+      "epoch": 693.0,
+      "grad_norm": 0.03731248155236244,
+      "learning_rate": 0.0006822222222222222,
+      "loss": 0.0039,
+      "step": 693
+    },
+    {
+      "epoch": 694.0,
+      "grad_norm": 0.04981468245387077,
+      "learning_rate": 0.00068,
+      "loss": 0.0043,
+      "step": 694
+    },
+    {
+      "epoch": 695.0,
+      "grad_norm": 0.03002479299902916,
+      "learning_rate": 0.0006777777777777778,
+      "loss": 0.0039,
+      "step": 695
+    },
+    {
+      "epoch": 696.0,
+      "grad_norm": 0.024293873459100723,
+      "learning_rate": 0.0006755555555555555,
+      "loss": 0.0035,
+      "step": 696
+    },
+    {
+      "epoch": 697.0,
+      "grad_norm": 0.042657673358917236,
+      "learning_rate": 0.0006733333333333334,
+      "loss": 0.0038,
+      "step": 697
+    },
+    {
+      "epoch": 698.0,
+      "grad_norm": 0.05678756162524223,
+      "learning_rate": 0.0006711111111111111,
+      "loss": 0.0042,
+      "step": 698
+    },
+    {
+      "epoch": 699.0,
+      "grad_norm": 0.024007895961403847,
+      "learning_rate": 0.0006688888888888889,
+      "loss": 0.0035,
+      "step": 699
+    },
+    {
+      "epoch": 700.0,
+      "grad_norm": 0.041944634169340134,
+      "learning_rate": 0.0006666666666666666,
+      "loss": 0.0038,
+      "step": 700
+    },
+    {
+      "epoch": 701.0,
+      "grad_norm": 0.04064980521798134,
+      "learning_rate": 0.0006644444444444444,
+      "loss": 0.0039,
+      "step": 701
+    },
+    {
+      "epoch": 702.0,
+      "grad_norm": 0.02716059610247612,
+      "learning_rate": 0.0006622222222222222,
+      "loss": 0.0036,
+      "step": 702
+    },
+    {
+      "epoch": 703.0,
+      "grad_norm": 0.023927675560116768,
+      "learning_rate": 0.00066,
+      "loss": 0.0035,
+      "step": 703
+    },
+    {
+      "epoch": 704.0,
+      "grad_norm": 0.03374261036515236,
+      "learning_rate": 0.0006577777777777779,
+      "loss": 0.0035,
+      "step": 704
+    },
+    {
+      "epoch": 705.0,
+      "grad_norm": 0.026420941576361656,
+      "learning_rate": 0.0006555555555555556,
+      "loss": 0.0037,
+      "step": 705
+    },
+    {
+      "epoch": 706.0,
+      "grad_norm": 0.03130070120096207,
+      "learning_rate": 0.0006533333333333333,
+      "loss": 0.0037,
+      "step": 706
+    },
+    {
+      "epoch": 707.0,
+      "grad_norm": 0.024263912811875343,
+      "learning_rate": 0.0006511111111111111,
+      "loss": 0.0034,
+      "step": 707
+    },
+    {
+      "epoch": 708.0,
+      "grad_norm": 0.027501242235302925,
+      "learning_rate": 0.0006488888888888888,
+      "loss": 0.0038,
+      "step": 708
+    },
+    {
+      "epoch": 709.0,
+      "grad_norm": 0.02567414566874504,
+      "learning_rate": 0.0006466666666666666,
+      "loss": 0.0035,
+      "step": 709
+    },
+    {
+      "epoch": 710.0,
+      "grad_norm": 0.024132689461112022,
+      "learning_rate": 0.0006444444444444444,
+      "loss": 0.0036,
+      "step": 710
+    },
+    {
+      "epoch": 711.0,
+      "grad_norm": 0.024011000990867615,
+      "learning_rate": 0.0006422222222222223,
+      "loss": 0.0036,
+      "step": 711
+    },
+    {
+      "epoch": 712.0,
+      "grad_norm": 0.02344098687171936,
+      "learning_rate": 0.00064,
+      "loss": 0.0036,
+      "step": 712
+    },
+    {
+      "epoch": 713.0,
+      "grad_norm": 0.023972654715180397,
+      "learning_rate": 0.0006377777777777778,
+      "loss": 0.0037,
+      "step": 713
+    },
+    {
+      "epoch": 714.0,
+      "grad_norm": 0.029682451859116554,
+      "learning_rate": 0.0006355555555555555,
+      "loss": 0.0036,
+      "step": 714
+    },
+    {
+      "epoch": 715.0,
+      "grad_norm": 0.02595234103500843,
+      "learning_rate": 0.0006333333333333333,
+      "loss": 0.0034,
+      "step": 715
+    },
+    {
+      "epoch": 716.0,
+      "grad_norm": 0.023837080225348473,
+      "learning_rate": 0.000631111111111111,
+      "loss": 0.0036,
+      "step": 716
+    },
+    {
+      "epoch": 717.0,
+      "grad_norm": 0.026442958042025566,
+      "learning_rate": 0.000628888888888889,
+      "loss": 0.0036,
+      "step": 717
+    },
+    {
+      "epoch": 718.0,
+      "grad_norm": 0.024803321808576584,
+      "learning_rate": 0.0006266666666666668,
+      "loss": 0.0036,
+      "step": 718
+    },
+    {
+      "epoch": 719.0,
+      "grad_norm": 0.023358143866062164,
+      "learning_rate": 0.0006244444444444445,
+      "loss": 0.0035,
+      "step": 719
+    },
+    {
+      "epoch": 720.0,
+      "grad_norm": 0.026576291769742966,
+      "learning_rate": 0.0006222222222222223,
+      "loss": 0.0033,
+      "step": 720
+    },
+    {
+      "epoch": 721.0,
+      "grad_norm": 0.030121877789497375,
+      "learning_rate": 0.00062,
+      "loss": 0.0035,
+      "step": 721
+    },
+    {
+      "epoch": 722.0,
+      "grad_norm": 0.02631719410419464,
+      "learning_rate": 0.0006177777777777777,
+      "loss": 0.0038,
+      "step": 722
+    },
+    {
+      "epoch": 723.0,
+      "grad_norm": 0.027765462175011635,
+      "learning_rate": 0.0006155555555555555,
+      "loss": 0.0034,
+      "step": 723
+    },
+    {
+      "epoch": 724.0,
+      "grad_norm": 0.031482163816690445,
+      "learning_rate": 0.0006133333333333334,
+      "loss": 0.0035,
+      "step": 724
+    },
+    {
+      "epoch": 725.0,
+      "grad_norm": 0.023726513609290123,
+      "learning_rate": 0.0006111111111111112,
+      "loss": 0.0033,
+      "step": 725
+    },
+    {
+      "epoch": 726.0,
+      "grad_norm": 0.02765974961221218,
+      "learning_rate": 0.000608888888888889,
+      "loss": 0.0034,
+      "step": 726
+    },
+    {
+      "epoch": 727.0,
+      "grad_norm": 0.03566696122288704,
+      "learning_rate": 0.0006066666666666667,
+      "loss": 0.0036,
+      "step": 727
+    },
+    {
+      "epoch": 728.0,
+      "grad_norm": 0.028009934350848198,
+      "learning_rate": 0.0006044444444444445,
+      "loss": 0.0035,
+      "step": 728
+    },
+    {
+      "epoch": 729.0,
+      "grad_norm": 0.02318991906940937,
+      "learning_rate": 0.0006022222222222222,
+      "loss": 0.0032,
+      "step": 729
+    },
+    {
+      "epoch": 730.0,
+      "grad_norm": 0.022554708644747734,
+      "learning_rate": 0.0006,
+      "loss": 0.0035,
+      "step": 730
+    },
+    {
+      "epoch": 731.0,
+      "grad_norm": 0.02474828064441681,
+      "learning_rate": 0.0005977777777777778,
+      "loss": 0.0033,
+      "step": 731
+    },
+    {
+      "epoch": 732.0,
+      "grad_norm": 0.0323016531765461,
+      "learning_rate": 0.0005955555555555556,
+      "loss": 0.0034,
+      "step": 732
+    },
+    {
+      "epoch": 733.0,
+      "grad_norm": 0.019244784489274025,
+      "learning_rate": 0.0005933333333333334,
+      "loss": 0.0033,
+      "step": 733
+    },
+    {
+      "epoch": 734.0,
+      "grad_norm": 0.021869376301765442,
+      "learning_rate": 0.0005911111111111112,
+      "loss": 0.0032,
+      "step": 734
+    },
+    {
+      "epoch": 735.0,
+      "grad_norm": 0.01879352703690529,
+      "learning_rate": 0.0005888888888888889,
+      "loss": 0.0032,
+      "step": 735
+    },
+    {
+      "epoch": 736.0,
+      "grad_norm": 0.026631703600287437,
+      "learning_rate": 0.0005866666666666667,
+      "loss": 0.0033,
+      "step": 736
+    },
+    {
+      "epoch": 737.0,
+      "grad_norm": 0.028665434569120407,
+      "learning_rate": 0.0005844444444444444,
+      "loss": 0.0035,
+      "step": 737
+    },
+    {
+      "epoch": 738.0,
+      "grad_norm": 0.01596708409488201,
+      "learning_rate": 0.0005822222222222223,
+      "loss": 0.003,
+      "step": 738
+    },
+    {
+      "epoch": 739.0,
+      "grad_norm": 0.017112715169787407,
+      "learning_rate": 0.00058,
+      "loss": 0.0031,
+      "step": 739
+    },
+    {
+      "epoch": 740.0,
+      "grad_norm": 0.0246286503970623,
+      "learning_rate": 0.0005777777777777778,
+      "loss": 0.0034,
+      "step": 740
+    },
+    {
+      "epoch": 741.0,
+      "grad_norm": 0.01566813327372074,
+      "learning_rate": 0.0005755555555555556,
+      "loss": 0.003,
+      "step": 741
+    },
+    {
+      "epoch": 742.0,
+      "grad_norm": 0.02360912226140499,
+      "learning_rate": 0.0005733333333333334,
+      "loss": 0.0033,
+      "step": 742
+    },
+    {
+      "epoch": 743.0,
+      "grad_norm": 0.024110812693834305,
+      "learning_rate": 0.0005711111111111111,
+      "loss": 0.0033,
+      "step": 743
+    },
+    {
+      "epoch": 744.0,
+      "grad_norm": 0.020427672192454338,
+      "learning_rate": 0.0005688888888888889,
+      "loss": 0.0032,
+      "step": 744
+    },
+    {
+      "epoch": 745.0,
+      "grad_norm": 0.025975676253437996,
+      "learning_rate": 0.0005666666666666667,
+      "loss": 0.0031,
+      "step": 745
+    },
+    {
+      "epoch": 746.0,
+      "grad_norm": 0.022355573251843452,
+      "learning_rate": 0.0005644444444444445,
+      "loss": 0.0033,
+      "step": 746
+    },
+    {
+      "epoch": 747.0,
+      "grad_norm": 0.04759243130683899,
+      "learning_rate": 0.0005622222222222222,
+      "loss": 0.0037,
+      "step": 747
+    },
+    {
+      "epoch": 748.0,
+      "grad_norm": 0.030733415856957436,
+      "learning_rate": 0.0005600000000000001,
+      "loss": 0.0033,
+      "step": 748
+    },
+    {
+      "epoch": 749.0,
+      "grad_norm": 0.02027864381670952,
+      "learning_rate": 0.0005577777777777778,
+      "loss": 0.0033,
+      "step": 749
+    },
+    {
+      "epoch": 750.0,
+      "grad_norm": 0.028928080573678017,
+      "learning_rate": 0.0005555555555555556,
+      "loss": 0.0031,
+      "step": 750
+    },
+    {
+      "epoch": 751.0,
+      "grad_norm": 0.03321721404790878,
+      "learning_rate": 0.0005533333333333333,
+      "loss": 0.0034,
+      "step": 751
+    },
+    {
+      "epoch": 752.0,
+      "grad_norm": 0.036787249147892,
+      "learning_rate": 0.0005511111111111112,
+      "loss": 0.0034,
+      "step": 752
+    },
+    {
+      "epoch": 753.0,
+      "grad_norm": 0.021165387704968452,
+      "learning_rate": 0.0005488888888888889,
+      "loss": 0.0031,
+      "step": 753
+    },
+    {
+      "epoch": 754.0,
+      "grad_norm": 0.025929953902959824,
+      "learning_rate": 0.0005466666666666667,
+      "loss": 0.0036,
+      "step": 754
+    },
+    {
+      "epoch": 755.0,
+      "grad_norm": 0.0214844960719347,
+      "learning_rate": 0.0005444444444444444,
+      "loss": 0.0031,
+      "step": 755
+    },
+    {
+      "epoch": 756.0,
+      "grad_norm": 0.030406184494495392,
+      "learning_rate": 0.0005422222222222223,
+      "loss": 0.0034,
+      "step": 756
+    },
+    {
+      "epoch": 757.0,
+      "grad_norm": 0.0321720615029335,
+      "learning_rate": 0.00054,
+      "loss": 0.0035,
+      "step": 757
+    },
+    {
+      "epoch": 758.0,
+      "grad_norm": 0.026725683361291885,
+      "learning_rate": 0.0005377777777777779,
+      "loss": 0.0032,
+      "step": 758
+    },
+    {
+      "epoch": 759.0,
+      "grad_norm": 0.031080015003681183,
+      "learning_rate": 0.0005355555555555556,
+      "loss": 0.0035,
+      "step": 759
+    },
+    {
+      "epoch": 760.0,
+      "grad_norm": 0.023731930181384087,
+      "learning_rate": 0.0005333333333333334,
+      "loss": 0.0031,
+      "step": 760
+    },
+    {
+      "epoch": 761.0,
+      "grad_norm": 0.026623785495758057,
+      "learning_rate": 0.0005311111111111111,
+      "loss": 0.0032,
+      "step": 761
+    },
+    {
+      "epoch": 762.0,
+      "grad_norm": 0.042063500732183456,
+      "learning_rate": 0.0005288888888888889,
+      "loss": 0.0032,
+      "step": 762
+    },
+    {
+      "epoch": 763.0,
+      "grad_norm": 0.021555962041020393,
+      "learning_rate": 0.0005266666666666666,
+      "loss": 0.0031,
+      "step": 763
+    },
+    {
+      "epoch": 764.0,
+      "grad_norm": 0.026008032262325287,
+      "learning_rate": 0.0005244444444444445,
+      "loss": 0.0034,
+      "step": 764
+    },
+    {
+      "epoch": 765.0,
+      "grad_norm": 0.01928178407251835,
+      "learning_rate": 0.0005222222222222223,
+      "loss": 0.003,
+      "step": 765
+    },
+    {
+      "epoch": 766.0,
+      "grad_norm": 0.02430052123963833,
+      "learning_rate": 0.0005200000000000001,
+      "loss": 0.0031,
+      "step": 766
+    },
+    {
+      "epoch": 767.0,
+      "grad_norm": 0.028112513944506645,
+      "learning_rate": 0.0005177777777777778,
+      "loss": 0.0033,
+      "step": 767
+    },
+    {
+      "epoch": 768.0,
+      "grad_norm": 0.019901221618056297,
+      "learning_rate": 0.0005155555555555556,
+      "loss": 0.003,
+      "step": 768
+    },
+    {
+      "epoch": 769.0,
+      "grad_norm": 0.02807488478720188,
+      "learning_rate": 0.0005133333333333333,
+      "loss": 0.0031,
+      "step": 769
+    },
+    {
+      "epoch": 770.0,
+      "grad_norm": 0.027293385937809944,
+      "learning_rate": 0.0005111111111111111,
+      "loss": 0.0033,
+      "step": 770
+    },
+    {
+      "epoch": 771.0,
+      "grad_norm": 0.022820137441158295,
+      "learning_rate": 0.0005088888888888888,
+      "loss": 0.003,
+      "step": 771
+    },
+    {
+      "epoch": 772.0,
+      "grad_norm": 0.028339603915810585,
+      "learning_rate": 0.0005066666666666668,
+      "loss": 0.0033,
+      "step": 772
+    },
+    {
+      "epoch": 773.0,
+      "grad_norm": 0.027798311784863472,
+      "learning_rate": 0.0005044444444444445,
+      "loss": 0.0032,
+      "step": 773
+    },
+    {
+      "epoch": 774.0,
+      "grad_norm": 0.037513189017772675,
+      "learning_rate": 0.0005022222222222223,
+      "loss": 0.0031,
+      "step": 774
+    },
+    {
+      "epoch": 775.0,
+      "grad_norm": 0.029608087614178658,
+      "learning_rate": 0.0005,
+      "loss": 0.0031,
+      "step": 775
+    },
+    {
+      "epoch": 776.0,
+      "grad_norm": 0.02258935756981373,
+      "learning_rate": 0.0004977777777777778,
+      "loss": 0.0034,
+      "step": 776
+    },
+    {
+      "epoch": 777.0,
+      "grad_norm": 0.03222902864217758,
+      "learning_rate": 0.0004955555555555556,
+      "loss": 0.0037,
+      "step": 777
+    },
+    {
+      "epoch": 778.0,
+      "grad_norm": 0.028507541865110397,
+      "learning_rate": 0.0004933333333333334,
+      "loss": 0.0033,
+      "step": 778
+    },
+    {
+      "epoch": 779.0,
+      "grad_norm": 0.026021234691143036,
+      "learning_rate": 0.0004911111111111111,
+      "loss": 0.0031,
+      "step": 779
+    },
+    {
+      "epoch": 780.0,
+      "grad_norm": 0.03054329752922058,
+      "learning_rate": 0.0004888888888888889,
+      "loss": 0.0033,
+      "step": 780
+    },
+    {
+      "epoch": 781.0,
+      "grad_norm": 0.025316089391708374,
+      "learning_rate": 0.0004866666666666667,
+      "loss": 0.0033,
+      "step": 781
+    },
+    {
+      "epoch": 782.0,
+      "grad_norm": 0.02000274695456028,
+      "learning_rate": 0.00048444444444444446,
+      "loss": 0.003,
+      "step": 782
+    },
+    {
+      "epoch": 783.0,
+      "grad_norm": 0.03106926940381527,
+      "learning_rate": 0.0004822222222222222,
+      "loss": 0.0033,
+      "step": 783
+    },
+    {
+      "epoch": 784.0,
+      "grad_norm": 0.02422090247273445,
+      "learning_rate": 0.00048,
+      "loss": 0.0032,
+      "step": 784
+    },
+    {
+      "epoch": 785.0,
+      "grad_norm": 0.03346557170152664,
+      "learning_rate": 0.0004777777777777778,
+      "loss": 0.0033,
+      "step": 785
+    },
+    {
+      "epoch": 786.0,
+      "grad_norm": 0.016884582117199898,
+      "learning_rate": 0.00047555555555555556,
+      "loss": 0.003,
+      "step": 786
+    },
+    {
+      "epoch": 787.0,
+      "grad_norm": 0.023125050589442253,
+      "learning_rate": 0.00047333333333333336,
+      "loss": 0.0031,
+      "step": 787
+    },
+    {
+      "epoch": 788.0,
+      "grad_norm": 0.015507596544921398,
+      "learning_rate": 0.0004711111111111111,
+      "loss": 0.003,
+      "step": 788
+    },
+    {
+      "epoch": 789.0,
+      "grad_norm": 0.02069436013698578,
+      "learning_rate": 0.0004688888888888889,
+      "loss": 0.0032,
+      "step": 789
+    },
+    {
+      "epoch": 790.0,
+      "grad_norm": 0.022422535344958305,
+      "learning_rate": 0.00046666666666666666,
+      "loss": 0.0032,
+      "step": 790
+    },
+    {
+      "epoch": 791.0,
+      "grad_norm": 0.02150949463248253,
+      "learning_rate": 0.00046444444444444446,
+      "loss": 0.003,
+      "step": 791
+    },
+    {
+      "epoch": 792.0,
+      "grad_norm": 0.03762350231409073,
+      "learning_rate": 0.0004622222222222222,
+      "loss": 0.0034,
+      "step": 792
+    },
+    {
+      "epoch": 793.0,
+      "grad_norm": 0.018060874193906784,
+      "learning_rate": 0.00046,
+      "loss": 0.003,
+      "step": 793
+    },
+    {
+      "epoch": 794.0,
+      "grad_norm": 0.023126404732465744,
+      "learning_rate": 0.0004577777777777778,
+      "loss": 0.003,
+      "step": 794
+    },
+    {
+      "epoch": 795.0,
+      "grad_norm": 0.024747442454099655,
+      "learning_rate": 0.00045555555555555556,
+      "loss": 0.0031,
+      "step": 795
+    },
+    {
+      "epoch": 796.0,
+      "grad_norm": 0.03307885304093361,
+      "learning_rate": 0.0004533333333333333,
+      "loss": 0.003,
+      "step": 796
+    },
+    {
+      "epoch": 797.0,
+      "grad_norm": 0.025574902072548866,
+      "learning_rate": 0.0004511111111111111,
+      "loss": 0.003,
+      "step": 797
+    },
+    {
+      "epoch": 798.0,
+      "grad_norm": 0.018072202801704407,
+      "learning_rate": 0.0004488888888888889,
+      "loss": 0.003,
+      "step": 798
+    },
+    {
+      "epoch": 799.0,
+      "grad_norm": 0.02339911088347435,
+      "learning_rate": 0.00044666666666666666,
+      "loss": 0.0031,
+      "step": 799
+    },
+    {
+      "epoch": 800.0,
+      "grad_norm": 0.02469002641737461,
+      "learning_rate": 0.0004444444444444444,
+      "loss": 0.003,
+      "step": 800
+    },
+    {
+      "epoch": 801.0,
+      "grad_norm": 0.023919183760881424,
+      "learning_rate": 0.00044222222222222227,
+      "loss": 0.0034,
+      "step": 801
+    },
+    {
+      "epoch": 802.0,
+      "grad_norm": 0.018128234893083572,
+      "learning_rate": 0.00044,
+      "loss": 0.0028,
+      "step": 802
+    },
+    {
+      "epoch": 803.0,
+      "grad_norm": 0.024188602343201637,
+      "learning_rate": 0.00043777777777777776,
+      "loss": 0.0031,
+      "step": 803
+    },
+    {
+      "epoch": 804.0,
+      "grad_norm": 0.02027260698378086,
+      "learning_rate": 0.0004355555555555555,
+      "loss": 0.0032,
+      "step": 804
+    },
+    {
+      "epoch": 805.0,
+      "grad_norm": 0.019797317683696747,
+      "learning_rate": 0.00043333333333333337,
+      "loss": 0.003,
+      "step": 805
+    },
+    {
+      "epoch": 806.0,
+      "grad_norm": 0.027181904762983322,
+      "learning_rate": 0.0004311111111111111,
+      "loss": 0.0033,
+      "step": 806
+    },
+    {
+      "epoch": 807.0,
+      "grad_norm": 0.03041798062622547,
+      "learning_rate": 0.00042888888888888886,
+      "loss": 0.0031,
+      "step": 807
+    },
+    {
+      "epoch": 808.0,
+      "grad_norm": 0.025036826729774475,
+      "learning_rate": 0.0004266666666666667,
+      "loss": 0.003,
+      "step": 808
+    },
+    {
+      "epoch": 809.0,
+      "grad_norm": 0.02821156196296215,
+      "learning_rate": 0.00042444444444444447,
+      "loss": 0.003,
+      "step": 809
+    },
+    {
+      "epoch": 810.0,
+      "grad_norm": 0.01684625819325447,
+      "learning_rate": 0.0004222222222222222,
+      "loss": 0.0028,
+      "step": 810
+    },
+    {
+      "epoch": 811.0,
+      "grad_norm": 0.014117077924311161,
+      "learning_rate": 0.00042,
+      "loss": 0.0027,
+      "step": 811
+    },
+    {
+      "epoch": 812.0,
+      "grad_norm": 0.028561661019921303,
+      "learning_rate": 0.0004177777777777778,
+      "loss": 0.003,
+      "step": 812
+    },
+    {
+      "epoch": 813.0,
+      "grad_norm": 0.022630490362644196,
+      "learning_rate": 0.00041555555555555557,
+      "loss": 0.003,
+      "step": 813
+    },
+    {
+      "epoch": 814.0,
+      "grad_norm": 0.031984347850084305,
+      "learning_rate": 0.0004133333333333333,
+      "loss": 0.0033,
+      "step": 814
+    },
+    {
+      "epoch": 815.0,
+      "grad_norm": 0.02748434990644455,
+      "learning_rate": 0.0004111111111111111,
+      "loss": 0.0031,
+      "step": 815
+    },
+    {
+      "epoch": 816.0,
+      "grad_norm": 0.02543744631111622,
+      "learning_rate": 0.0004088888888888889,
+      "loss": 0.003,
+      "step": 816
+    },
+    {
+      "epoch": 817.0,
+      "grad_norm": 0.020958127453923225,
+      "learning_rate": 0.00040666666666666667,
+      "loss": 0.003,
+      "step": 817
+    },
+    {
+      "epoch": 818.0,
+      "grad_norm": 0.029978320002555847,
+      "learning_rate": 0.00040444444444444447,
+      "loss": 0.0033,
+      "step": 818
+    },
+    {
+      "epoch": 819.0,
+      "grad_norm": 0.03185059130191803,
+      "learning_rate": 0.0004022222222222222,
+      "loss": 0.0033,
+      "step": 819
+    },
+    {
+      "epoch": 820.0,
+      "grad_norm": 0.015984434634447098,
+      "learning_rate": 0.0004,
+      "loss": 0.0029,
+      "step": 820
+    },
+    {
+      "epoch": 821.0,
+      "grad_norm": 0.016744885593652725,
+      "learning_rate": 0.00039777777777777777,
+      "loss": 0.0028,
+      "step": 821
+    },
+    {
+      "epoch": 822.0,
+      "grad_norm": 0.02187785878777504,
+      "learning_rate": 0.00039555555555555557,
+      "loss": 0.0029,
+      "step": 822
+    },
+    {
+      "epoch": 823.0,
+      "grad_norm": 0.013766797259449959,
+      "learning_rate": 0.0003933333333333333,
+      "loss": 0.0027,
+      "step": 823
+    },
+    {
+      "epoch": 824.0,
+      "grad_norm": 0.04106425866484642,
+      "learning_rate": 0.0003911111111111111,
+      "loss": 0.0034,
+      "step": 824
+    },
+    {
+      "epoch": 825.0,
+      "grad_norm": 0.03544626384973526,
+      "learning_rate": 0.0003888888888888889,
+      "loss": 0.003,
+      "step": 825
+    },
+    {
+      "epoch": 826.0,
+      "grad_norm": 0.023083612322807312,
+      "learning_rate": 0.00038666666666666667,
+      "loss": 0.0031,
+      "step": 826
+    },
+    {
+      "epoch": 827.0,
+      "grad_norm": 0.01776996999979019,
+      "learning_rate": 0.0003844444444444444,
+      "loss": 0.0029,
+      "step": 827
+    },
+    {
+      "epoch": 828.0,
+      "grad_norm": 0.029645999893546104,
+      "learning_rate": 0.0003822222222222223,
+      "loss": 0.0032,
+      "step": 828
+    },
+    {
+      "epoch": 829.0,
+      "grad_norm": 0.024389177560806274,
+      "learning_rate": 0.00038,
+      "loss": 0.003,
+      "step": 829
+    },
+    {
+      "epoch": 830.0,
+      "grad_norm": 0.03170039877295494,
+      "learning_rate": 0.00037777777777777777,
+      "loss": 0.0033,
+      "step": 830
+    },
+    {
+      "epoch": 831.0,
+      "grad_norm": 0.020817887037992477,
+      "learning_rate": 0.0003755555555555555,
+      "loss": 0.0032,
+      "step": 831
+    },
+    {
+      "epoch": 832.0,
+      "grad_norm": 0.01924346759915352,
+      "learning_rate": 0.0003733333333333334,
+      "loss": 0.0028,
+      "step": 832
+    },
+    {
+      "epoch": 833.0,
+      "grad_norm": 0.01933015137910843,
+      "learning_rate": 0.0003711111111111111,
+      "loss": 0.0029,
+      "step": 833
+    },
+    {
+      "epoch": 834.0,
+      "grad_norm": 0.02916400507092476,
+      "learning_rate": 0.00036888888888888887,
+      "loss": 0.0031,
+      "step": 834
+    },
+    {
+      "epoch": 835.0,
+      "grad_norm": 0.020619528368115425,
+      "learning_rate": 0.00036666666666666667,
+      "loss": 0.003,
+      "step": 835
+    },
+    {
+      "epoch": 836.0,
+      "grad_norm": 0.025901442393660545,
+      "learning_rate": 0.00036444444444444447,
+      "loss": 0.0032,
+      "step": 836
+    },
+    {
+      "epoch": 837.0,
+      "grad_norm": 0.02749483659863472,
+      "learning_rate": 0.0003622222222222222,
+      "loss": 0.0031,
+      "step": 837
+    },
+    {
+      "epoch": 838.0,
+      "grad_norm": 0.01978285051882267,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 0.0028,
+      "step": 838
+    },
+    {
+      "epoch": 839.0,
+      "grad_norm": 0.023931678384542465,
+      "learning_rate": 0.00035777777777777777,
+      "loss": 0.0029,
+      "step": 839
+    },
+    {
+      "epoch": 840.0,
+      "grad_norm": 0.016439393162727356,
+      "learning_rate": 0.00035555555555555557,
+      "loss": 0.0028,
+      "step": 840
+    },
+    {
+      "epoch": 841.0,
+      "grad_norm": 0.024643810465931892,
+      "learning_rate": 0.0003533333333333333,
+      "loss": 0.0029,
+      "step": 841
+    },
+    {
+      "epoch": 842.0,
+      "grad_norm": 0.025052694603800774,
+      "learning_rate": 0.0003511111111111111,
+      "loss": 0.0029,
+      "step": 842
+    },
+    {
+      "epoch": 843.0,
+      "grad_norm": 0.02013804018497467,
+      "learning_rate": 0.0003488888888888889,
+      "loss": 0.0029,
+      "step": 843
+    },
+    {
+      "epoch": 844.0,
+      "grad_norm": 0.019899819046258926,
+      "learning_rate": 0.00034666666666666667,
+      "loss": 0.0029,
+      "step": 844
+    },
+    {
+      "epoch": 845.0,
+      "grad_norm": 0.020912861451506615,
+      "learning_rate": 0.0003444444444444445,
+      "loss": 0.0029,
+      "step": 845
+    },
+    {
+      "epoch": 846.0,
+      "grad_norm": 0.015246815979480743,
+      "learning_rate": 0.0003422222222222222,
+      "loss": 0.0028,
+      "step": 846
+    },
+    {
+      "epoch": 847.0,
+      "grad_norm": 0.026435496285557747,
+      "learning_rate": 0.00034,
+      "loss": 0.0032,
+      "step": 847
+    },
+    {
+      "epoch": 848.0,
+      "grad_norm": 0.019885210320353508,
+      "learning_rate": 0.00033777777777777777,
+      "loss": 0.0028,
+      "step": 848
+    },
+    {
+      "epoch": 849.0,
+      "grad_norm": 0.020924292504787445,
+      "learning_rate": 0.0003355555555555556,
+      "loss": 0.0029,
+      "step": 849
+    },
+    {
+      "epoch": 850.0,
+      "grad_norm": 0.019253870472311974,
+      "learning_rate": 0.0003333333333333333,
+      "loss": 0.0029,
+      "step": 850
+    },
+    {
+      "epoch": 851.0,
+      "grad_norm": 0.019240371882915497,
+      "learning_rate": 0.0003311111111111111,
+      "loss": 0.0028,
+      "step": 851
+    },
+    {
+      "epoch": 852.0,
+      "grad_norm": 0.021707167848944664,
+      "learning_rate": 0.0003288888888888889,
+      "loss": 0.003,
+      "step": 852
+    },
+    {
+      "epoch": 853.0,
+      "grad_norm": 0.020251473411917686,
+      "learning_rate": 0.0003266666666666667,
+      "loss": 0.003,
+      "step": 853
+    },
+    {
+      "epoch": 854.0,
+      "grad_norm": 0.020360205322504044,
+      "learning_rate": 0.0003244444444444444,
+      "loss": 0.0028,
+      "step": 854
+    },
+    {
+      "epoch": 855.0,
+      "grad_norm": 0.019384529441595078,
+      "learning_rate": 0.0003222222222222222,
+      "loss": 0.0029,
+      "step": 855
+    },
+    {
+      "epoch": 856.0,
+      "grad_norm": 0.02220081351697445,
+      "learning_rate": 0.00032,
+      "loss": 0.0032,
+      "step": 856
+    },
+    {
+      "epoch": 857.0,
+      "grad_norm": 0.02192023955285549,
+      "learning_rate": 0.0003177777777777778,
+      "loss": 0.0029,
+      "step": 857
+    },
+    {
+      "epoch": 858.0,
+      "grad_norm": 0.020040197297930717,
+      "learning_rate": 0.0003155555555555555,
+      "loss": 0.0029,
+      "step": 858
+    },
+    {
+      "epoch": 859.0,
+      "grad_norm": 0.022478275001049042,
+      "learning_rate": 0.0003133333333333334,
+      "loss": 0.0031,
+      "step": 859
+    },
+    {
+      "epoch": 860.0,
+      "grad_norm": 0.017191395163536072,
+      "learning_rate": 0.0003111111111111111,
+      "loss": 0.0027,
+      "step": 860
+    },
+    {
+      "epoch": 861.0,
+      "grad_norm": 0.025374887511134148,
+      "learning_rate": 0.0003088888888888889,
+      "loss": 0.0031,
+      "step": 861
+    },
+    {
+      "epoch": 862.0,
+      "grad_norm": 0.01610608585178852,
+      "learning_rate": 0.0003066666666666667,
+      "loss": 0.0027,
+      "step": 862
+    },
+    {
+      "epoch": 863.0,
+      "grad_norm": 0.019191846251487732,
+      "learning_rate": 0.0003044444444444445,
+      "loss": 0.0027,
+      "step": 863
+    },
+    {
+      "epoch": 864.0,
+      "grad_norm": 0.021266650408506393,
+      "learning_rate": 0.0003022222222222222,
+      "loss": 0.0029,
+      "step": 864
+    },
+    {
+      "epoch": 865.0,
+      "grad_norm": 0.024204988032579422,
+      "learning_rate": 0.0003,
+      "loss": 0.003,
+      "step": 865
+    },
+    {
+      "epoch": 866.0,
+      "grad_norm": 0.018310649320483208,
+      "learning_rate": 0.0002977777777777778,
+      "loss": 0.0027,
+      "step": 866
+    },
+    {
+      "epoch": 867.0,
+      "grad_norm": 0.017516782507300377,
+      "learning_rate": 0.0002955555555555556,
+      "loss": 0.0029,
+      "step": 867
+    },
+    {
+      "epoch": 868.0,
+      "grad_norm": 0.021248290315270424,
+      "learning_rate": 0.0002933333333333333,
+      "loss": 0.0029,
+      "step": 868
+    },
+    {
+      "epoch": 869.0,
+      "grad_norm": 0.01870272122323513,
+      "learning_rate": 0.00029111111111111113,
+      "loss": 0.0028,
+      "step": 869
+    },
+    {
+      "epoch": 870.0,
+      "grad_norm": 0.02665873058140278,
+      "learning_rate": 0.0002888888888888889,
+      "loss": 0.003,
+      "step": 870
+    },
+    {
+      "epoch": 871.0,
+      "grad_norm": 0.02477414719760418,
+      "learning_rate": 0.0002866666666666667,
+      "loss": 0.003,
+      "step": 871
+    },
+    {
+      "epoch": 872.0,
+      "grad_norm": 0.023296542465686798,
+      "learning_rate": 0.0002844444444444444,
+      "loss": 0.0031,
+      "step": 872
+    },
+    {
+      "epoch": 873.0,
+      "grad_norm": 0.03521310165524483,
+      "learning_rate": 0.00028222222222222223,
+      "loss": 0.0029,
+      "step": 873
+    },
+    {
+      "epoch": 874.0,
+      "grad_norm": 0.020849304273724556,
+      "learning_rate": 0.00028000000000000003,
+      "loss": 0.0028,
+      "step": 874
+    },
+    {
+      "epoch": 875.0,
+      "grad_norm": 0.023307524621486664,
+      "learning_rate": 0.0002777777777777778,
+      "loss": 0.003,
+      "step": 875
+    },
+    {
+      "epoch": 876.0,
+      "grad_norm": 0.01824437826871872,
+      "learning_rate": 0.0002755555555555556,
+      "loss": 0.0025,
+      "step": 876
+    },
+    {
+      "epoch": 877.0,
+      "grad_norm": 0.02158845216035843,
+      "learning_rate": 0.00027333333333333333,
+      "loss": 0.0029,
+      "step": 877
+    },
+    {
+      "epoch": 878.0,
+      "grad_norm": 0.02377997152507305,
+      "learning_rate": 0.00027111111111111113,
+      "loss": 0.0029,
+      "step": 878
+    },
+    {
+      "epoch": 879.0,
+      "grad_norm": 0.024584239348769188,
+      "learning_rate": 0.00026888888888888893,
+      "loss": 0.0031,
+      "step": 879
+    },
+    {
+      "epoch": 880.0,
+      "grad_norm": 0.016272418200969696,
+      "learning_rate": 0.0002666666666666667,
+      "loss": 0.0027,
+      "step": 880
+    },
+    {
+      "epoch": 881.0,
+      "grad_norm": 0.01953684352338314,
+      "learning_rate": 0.00026444444444444443,
+      "loss": 0.0027,
+      "step": 881
+    },
+    {
+      "epoch": 882.0,
+      "grad_norm": 0.023698432371020317,
+      "learning_rate": 0.00026222222222222223,
+      "loss": 0.003,
+      "step": 882
+    },
+    {
+      "epoch": 883.0,
+      "grad_norm": 0.021787166595458984,
+      "learning_rate": 0.00026000000000000003,
+      "loss": 0.003,
+      "step": 883
+    },
+    {
+      "epoch": 884.0,
+      "grad_norm": 0.026674091815948486,
+      "learning_rate": 0.0002577777777777778,
+      "loss": 0.0027,
+      "step": 884
+    },
+    {
+      "epoch": 885.0,
+      "grad_norm": 0.02600809372961521,
+      "learning_rate": 0.00025555555555555553,
+      "loss": 0.0028,
+      "step": 885
+    },
+    {
+      "epoch": 886.0,
+      "grad_norm": 0.0214143767952919,
+      "learning_rate": 0.0002533333333333334,
+      "loss": 0.0027,
+      "step": 886
+    },
+    {
+      "epoch": 887.0,
+      "grad_norm": 0.01773553155362606,
+      "learning_rate": 0.00025111111111111113,
+      "loss": 0.0028,
+      "step": 887
+    },
+    {
+      "epoch": 888.0,
+      "grad_norm": 0.016479508951306343,
+      "learning_rate": 0.0002488888888888889,
+      "loss": 0.0027,
+      "step": 888
+    },
+    {
+      "epoch": 889.0,
+      "grad_norm": 0.026842381805181503,
+      "learning_rate": 0.0002466666666666667,
+      "loss": 0.0031,
+      "step": 889
+    },
+    {
+      "epoch": 890.0,
+      "grad_norm": 0.029731744900345802,
+      "learning_rate": 0.00024444444444444443,
+      "loss": 0.0028,
+      "step": 890
+    },
+    {
+      "epoch": 891.0,
+      "grad_norm": 0.018305297940969467,
+      "learning_rate": 0.00024222222222222223,
+      "loss": 0.0028,
+      "step": 891
+    },
+    {
+      "epoch": 892.0,
+      "grad_norm": 0.020879078656435013,
+      "learning_rate": 0.00024,
+      "loss": 0.0027,
+      "step": 892
+    },
+    {
+      "epoch": 893.0,
+      "grad_norm": 0.025136977434158325,
+      "learning_rate": 0.00023777777777777778,
+      "loss": 0.0027,
+      "step": 893
+    },
+    {
+      "epoch": 894.0,
+      "grad_norm": 0.02135421149432659,
+      "learning_rate": 0.00023555555555555556,
+      "loss": 0.0028,
+      "step": 894
+    },
+    {
+      "epoch": 895.0,
+      "grad_norm": 0.018956074491143227,
+      "learning_rate": 0.00023333333333333333,
+      "loss": 0.0026,
+      "step": 895
+    },
+    {
+      "epoch": 896.0,
+      "grad_norm": 0.024466995149850845,
+      "learning_rate": 0.0002311111111111111,
+      "loss": 0.0028,
+      "step": 896
+    },
+    {
+      "epoch": 897.0,
+      "grad_norm": 0.024282222613692284,
+      "learning_rate": 0.0002288888888888889,
+      "loss": 0.0028,
+      "step": 897
+    },
+    {
+      "epoch": 898.0,
+      "grad_norm": 0.02457410655915737,
+      "learning_rate": 0.00022666666666666666,
+      "loss": 0.0028,
+      "step": 898
+    },
+    {
+      "epoch": 899.0,
+      "grad_norm": 0.025316430255770683,
+      "learning_rate": 0.00022444444444444446,
+      "loss": 0.0028,
+      "step": 899
+    },
+    {
+      "epoch": 900.0,
+      "grad_norm": 0.015992436558008194,
+      "learning_rate": 0.0002222222222222222,
+      "loss": 0.0027,
+      "step": 900
+    },
+    {
+      "epoch": 901.0,
+      "grad_norm": 0.01664648950099945,
+      "learning_rate": 0.00022,
+      "loss": 0.0026,
+      "step": 901
+    },
+    {
+      "epoch": 902.0,
+      "grad_norm": 0.019682608544826508,
+      "learning_rate": 0.00021777777777777776,
+      "loss": 0.0027,
+      "step": 902
+    },
+    {
+      "epoch": 903.0,
+      "grad_norm": 0.024491267278790474,
+      "learning_rate": 0.00021555555555555556,
+      "loss": 0.0031,
+      "step": 903
+    },
+    {
+      "epoch": 904.0,
+      "grad_norm": 0.024832140654325485,
+      "learning_rate": 0.00021333333333333336,
+      "loss": 0.0027,
+      "step": 904
+    },
+    {
+      "epoch": 905.0,
+      "grad_norm": 0.019126785919070244,
+      "learning_rate": 0.0002111111111111111,
+      "loss": 0.0027,
+      "step": 905
+    },
+    {
+      "epoch": 906.0,
+      "grad_norm": 0.024714525789022446,
+      "learning_rate": 0.0002088888888888889,
+      "loss": 0.003,
+      "step": 906
+    },
+    {
+      "epoch": 907.0,
+      "grad_norm": 0.018799038603901863,
+      "learning_rate": 0.00020666666666666666,
+      "loss": 0.0025,
+      "step": 907
+    },
+    {
+      "epoch": 908.0,
+      "grad_norm": 0.014316687360405922,
+      "learning_rate": 0.00020444444444444446,
+      "loss": 0.0025,
+      "step": 908
+    },
+    {
+      "epoch": 909.0,
+      "grad_norm": 0.0304707158356905,
+      "learning_rate": 0.00020222222222222223,
+      "loss": 0.0028,
+      "step": 909
+    },
+    {
+      "epoch": 910.0,
+      "grad_norm": 0.018442662432789803,
+      "learning_rate": 0.0002,
+      "loss": 0.0026,
+      "step": 910
+    },
+    {
+      "epoch": 911.0,
+      "grad_norm": 0.02719203568994999,
+      "learning_rate": 0.00019777777777777778,
+      "loss": 0.003,
+      "step": 911
+    },
+    {
+      "epoch": 912.0,
+      "grad_norm": 0.02310093119740486,
+      "learning_rate": 0.00019555555555555556,
+      "loss": 0.0028,
+      "step": 912
+    },
+    {
+      "epoch": 913.0,
+      "grad_norm": 0.017705217003822327,
+      "learning_rate": 0.00019333333333333333,
+      "loss": 0.0027,
+      "step": 913
+    },
+    {
+      "epoch": 914.0,
+      "grad_norm": 0.017214365303516388,
+      "learning_rate": 0.00019111111111111114,
+      "loss": 0.0026,
+      "step": 914
+    },
+    {
+      "epoch": 915.0,
+      "grad_norm": 0.020950743928551674,
+      "learning_rate": 0.00018888888888888888,
+      "loss": 0.0027,
+      "step": 915
+    },
+    {
+      "epoch": 916.0,
+      "grad_norm": 0.018532825633883476,
+      "learning_rate": 0.0001866666666666667,
+      "loss": 0.0026,
+      "step": 916
+    },
+    {
+      "epoch": 917.0,
+      "grad_norm": 0.014188375324010849,
+      "learning_rate": 0.00018444444444444443,
+      "loss": 0.0028,
+      "step": 917
+    },
+    {
+      "epoch": 918.0,
+      "grad_norm": 0.025212949141860008,
+      "learning_rate": 0.00018222222222222224,
+      "loss": 0.003,
+      "step": 918
+    },
+    {
+      "epoch": 919.0,
+      "grad_norm": 0.02256765589118004,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 0.0027,
+      "step": 919
+    },
+    {
+      "epoch": 920.0,
+      "grad_norm": 0.015511687844991684,
+      "learning_rate": 0.00017777777777777779,
+      "loss": 0.0027,
+      "step": 920
+    },
+    {
+      "epoch": 921.0,
+      "grad_norm": 0.02182592637836933,
+      "learning_rate": 0.00017555555555555556,
+      "loss": 0.0029,
+      "step": 921
+    },
+    {
+      "epoch": 922.0,
+      "grad_norm": 0.016262182965874672,
+      "learning_rate": 0.00017333333333333334,
+      "loss": 0.0026,
+      "step": 922
+    },
+    {
+      "epoch": 923.0,
+      "grad_norm": 0.0173965897411108,
+      "learning_rate": 0.0001711111111111111,
+      "loss": 0.0027,
+      "step": 923
+    },
+    {
+      "epoch": 924.0,
+      "grad_norm": 0.022845404222607613,
+      "learning_rate": 0.00016888888888888889,
+      "loss": 0.0028,
+      "step": 924
+    },
+    {
+      "epoch": 925.0,
+      "grad_norm": 0.019500279799103737,
+      "learning_rate": 0.00016666666666666666,
+      "loss": 0.0025,
+      "step": 925
+    },
+    {
+      "epoch": 926.0,
+      "grad_norm": 0.033996641635894775,
+      "learning_rate": 0.00016444444444444446,
+      "loss": 0.003,
+      "step": 926
+    },
+    {
+      "epoch": 927.0,
+      "grad_norm": 0.02097196690738201,
+      "learning_rate": 0.0001622222222222222,
+      "loss": 0.0027,
+      "step": 927
+    },
+    {
+      "epoch": 928.0,
+      "grad_norm": 0.027539506554603577,
+      "learning_rate": 0.00016,
+      "loss": 0.0033,
+      "step": 928
+    },
+    {
+      "epoch": 929.0,
+      "grad_norm": 0.015689266845583916,
+      "learning_rate": 0.00015777777777777776,
+      "loss": 0.0026,
+      "step": 929
+    },
+    {
+      "epoch": 930.0,
+      "grad_norm": 0.020680051296949387,
+      "learning_rate": 0.00015555555555555556,
+      "loss": 0.0028,
+      "step": 930
+    },
+    {
+      "epoch": 931.0,
+      "grad_norm": 0.02494923025369644,
+      "learning_rate": 0.00015333333333333334,
+      "loss": 0.0029,
+      "step": 931
+    },
+    {
+      "epoch": 932.0,
+      "grad_norm": 0.028578734025359154,
+      "learning_rate": 0.0001511111111111111,
+      "loss": 0.003,
+      "step": 932
+    },
+    {
+      "epoch": 933.0,
+      "grad_norm": 0.029307426884770393,
+      "learning_rate": 0.0001488888888888889,
+      "loss": 0.0029,
+      "step": 933
+    },
+    {
+      "epoch": 934.0,
+      "grad_norm": 0.02381393313407898,
+      "learning_rate": 0.00014666666666666666,
+      "loss": 0.0028,
+      "step": 934
+    },
+    {
+      "epoch": 935.0,
+      "grad_norm": 0.013117119669914246,
+      "learning_rate": 0.00014444444444444444,
+      "loss": 0.0026,
+      "step": 935
+    },
+    {
+      "epoch": 936.0,
+      "grad_norm": 0.028397388756275177,
+      "learning_rate": 0.0001422222222222222,
+      "loss": 0.0027,
+      "step": 936
+    },
+    {
+      "epoch": 937.0,
+      "grad_norm": 0.021766725927591324,
+      "learning_rate": 0.00014000000000000001,
+      "loss": 0.0028,
+      "step": 937
+    },
+    {
+      "epoch": 938.0,
+      "grad_norm": 0.019310174509882927,
+      "learning_rate": 0.0001377777777777778,
+      "loss": 0.0027,
+      "step": 938
+    },
+    {
+      "epoch": 939.0,
+      "grad_norm": 0.0176254715770483,
+      "learning_rate": 0.00013555555555555556,
+      "loss": 0.0027,
+      "step": 939
+    },
+    {
+      "epoch": 940.0,
+      "grad_norm": 0.026609305292367935,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.0029,
+      "step": 940
+    },
+    {
+      "epoch": 941.0,
+      "grad_norm": 0.01736409030854702,
+      "learning_rate": 0.00013111111111111111,
+      "loss": 0.0027,
+      "step": 941
+    },
+    {
+      "epoch": 942.0,
+      "grad_norm": 0.020236855372786522,
+      "learning_rate": 0.0001288888888888889,
+      "loss": 0.0028,
+      "step": 942
+    },
+    {
+      "epoch": 943.0,
+      "grad_norm": 0.01565195992588997,
+      "learning_rate": 0.0001266666666666667,
+      "loss": 0.0028,
+      "step": 943
+    },
+    {
+      "epoch": 944.0,
+      "grad_norm": 0.02295234240591526,
+      "learning_rate": 0.00012444444444444444,
+      "loss": 0.0028,
+      "step": 944
+    },
+    {
+      "epoch": 945.0,
+      "grad_norm": 0.016273394227027893,
+      "learning_rate": 0.00012222222222222221,
+      "loss": 0.0025,
+      "step": 945
+    },
+    {
+      "epoch": 946.0,
+      "grad_norm": 0.021817076951265335,
+      "learning_rate": 0.00012,
+      "loss": 0.0028,
+      "step": 946
+    },
+    {
+      "epoch": 947.0,
+      "grad_norm": 0.02048509754240513,
+      "learning_rate": 0.00011777777777777778,
+      "loss": 0.0026,
+      "step": 947
+    },
+    {
+      "epoch": 948.0,
+      "grad_norm": 0.024927016347646713,
+      "learning_rate": 0.00011555555555555555,
+      "loss": 0.0028,
+      "step": 948
+    },
+    {
+      "epoch": 949.0,
+      "grad_norm": 0.014938845299184322,
+      "learning_rate": 0.00011333333333333333,
+      "loss": 0.0026,
+      "step": 949
+    },
+    {
+      "epoch": 950.0,
+      "grad_norm": 0.018117714673280716,
+      "learning_rate": 0.0001111111111111111,
+      "loss": 0.0027,
+      "step": 950
+    },
+    {
+      "epoch": 951.0,
+      "grad_norm": 0.020745469257235527,
+      "learning_rate": 0.00010888888888888888,
+      "loss": 0.0027,
+      "step": 951
+    },
+    {
+      "epoch": 952.0,
+      "grad_norm": 0.013773414306342602,
+      "learning_rate": 0.00010666666666666668,
+      "loss": 0.0025,
+      "step": 952
+    },
+    {
+      "epoch": 953.0,
+      "grad_norm": 0.023852935060858727,
+      "learning_rate": 0.00010444444444444445,
+      "loss": 0.0028,
+      "step": 953
+    },
+    {
+      "epoch": 954.0,
+      "grad_norm": 0.023745089769363403,
+      "learning_rate": 0.00010222222222222223,
+      "loss": 0.0028,
+      "step": 954
+    },
+    {
+      "epoch": 955.0,
+      "grad_norm": 0.027273166924715042,
+      "learning_rate": 0.0001,
+      "loss": 0.003,
+      "step": 955
+    },
+    {
+      "epoch": 956.0,
+      "grad_norm": 0.020344140008091927,
+      "learning_rate": 9.777777777777778e-05,
+      "loss": 0.003,
+      "step": 956
+    },
+    {
+      "epoch": 957.0,
+      "grad_norm": 0.017448484897613525,
+      "learning_rate": 9.555555555555557e-05,
+      "loss": 0.0027,
+      "step": 957
+    },
+    {
+      "epoch": 958.0,
+      "grad_norm": 0.016183484345674515,
+      "learning_rate": 9.333333333333334e-05,
+      "loss": 0.0026,
+      "step": 958
+    },
+    {
+      "epoch": 959.0,
+      "grad_norm": 0.01649181731045246,
+      "learning_rate": 9.111111111111112e-05,
+      "loss": 0.0027,
+      "step": 959
+    },
+    {
+      "epoch": 960.0,
+      "grad_norm": 0.024920279160141945,
+      "learning_rate": 8.888888888888889e-05,
+      "loss": 0.0027,
+      "step": 960
+    },
+    {
+      "epoch": 961.0,
+      "grad_norm": 0.013914654962718487,
+      "learning_rate": 8.666666666666667e-05,
+      "loss": 0.0024,
+      "step": 961
+    },
+    {
+      "epoch": 962.0,
+      "grad_norm": 0.021497434005141258,
+      "learning_rate": 8.444444444444444e-05,
+      "loss": 0.0026,
+      "step": 962
+    },
+    {
+      "epoch": 963.0,
+      "grad_norm": 0.019979996606707573,
+      "learning_rate": 8.222222222222223e-05,
+      "loss": 0.0029,
+      "step": 963
+    },
+    {
+      "epoch": 964.0,
+      "grad_norm": 0.02189183235168457,
+      "learning_rate": 8e-05,
+      "loss": 0.0028,
+      "step": 964
+    },
+    {
+      "epoch": 965.0,
+      "grad_norm": 0.015944723039865494,
+      "learning_rate": 7.777777777777778e-05,
+      "loss": 0.0026,
+      "step": 965
+    },
+    {
+      "epoch": 966.0,
+      "grad_norm": 0.01600065268576145,
+      "learning_rate": 7.555555555555556e-05,
+      "loss": 0.0025,
+      "step": 966
+    },
+    {
+      "epoch": 967.0,
+      "grad_norm": 0.020630113780498505,
+      "learning_rate": 7.333333333333333e-05,
+      "loss": 0.0027,
+      "step": 967
+    },
+    {
+      "epoch": 968.0,
+      "grad_norm": 0.014975383877754211,
+      "learning_rate": 7.11111111111111e-05,
+      "loss": 0.0025,
+      "step": 968
+    },
+    {
+      "epoch": 969.0,
+      "grad_norm": 0.016374630853533745,
+      "learning_rate": 6.88888888888889e-05,
+      "loss": 0.0026,
+      "step": 969
+    },
+    {
+      "epoch": 970.0,
+      "grad_norm": 0.019182894378900528,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.0027,
+      "step": 970
+    },
+    {
+      "epoch": 971.0,
+      "grad_norm": 0.024381978437304497,
+      "learning_rate": 6.444444444444444e-05,
+      "loss": 0.0028,
+      "step": 971
+    },
+    {
+      "epoch": 972.0,
+      "grad_norm": 0.019862636923789978,
+      "learning_rate": 6.222222222222222e-05,
+      "loss": 0.0026,
+      "step": 972
+    },
+    {
+      "epoch": 973.0,
+      "grad_norm": 0.019189875572919846,
+      "learning_rate": 6e-05,
+      "loss": 0.0025,
+      "step": 973
+    },
+    {
+      "epoch": 974.0,
+      "grad_norm": 0.012594843283295631,
+      "learning_rate": 5.7777777777777776e-05,
+      "loss": 0.0024,
+      "step": 974
+    },
+    {
+      "epoch": 975.0,
+      "grad_norm": 0.01766464300453663,
+      "learning_rate": 5.555555555555555e-05,
+      "loss": 0.0025,
+      "step": 975
+    },
+    {
+      "epoch": 976.0,
+      "grad_norm": 0.016625959426164627,
+      "learning_rate": 5.333333333333334e-05,
+      "loss": 0.0025,
+      "step": 976
+    },
+    {
+      "epoch": 977.0,
+      "grad_norm": 0.01728684827685356,
+      "learning_rate": 5.1111111111111115e-05,
+      "loss": 0.0027,
+      "step": 977
+    },
+    {
+      "epoch": 978.0,
+      "grad_norm": 0.017300087958574295,
+      "learning_rate": 4.888888888888889e-05,
+      "loss": 0.0026,
+      "step": 978
+    },
+    {
+      "epoch": 979.0,
+      "grad_norm": 0.024032112210989,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 0.0028,
+      "step": 979
+    },
+    {
+      "epoch": 980.0,
+      "grad_norm": 0.018220216035842896,
+      "learning_rate": 4.4444444444444447e-05,
+      "loss": 0.0026,
+      "step": 980
+    },
+    {
+      "epoch": 981.0,
+      "grad_norm": 0.015681209042668343,
+      "learning_rate": 4.222222222222222e-05,
+      "loss": 0.0025,
+      "step": 981
+    },
+    {
+      "epoch": 982.0,
+      "grad_norm": 0.0216491911560297,
+      "learning_rate": 4e-05,
+      "loss": 0.0029,
+      "step": 982
+    },
+    {
+      "epoch": 983.0,
+      "grad_norm": 0.021082593128085136,
+      "learning_rate": 3.777777777777778e-05,
+      "loss": 0.0025,
+      "step": 983
+    },
+    {
+      "epoch": 984.0,
+      "grad_norm": 0.02241634391248226,
+      "learning_rate": 3.555555555555555e-05,
+      "loss": 0.0026,
+      "step": 984
+    },
+    {
+      "epoch": 985.0,
+      "grad_norm": 0.018041379749774933,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 0.0025,
+      "step": 985
+    },
+    {
+      "epoch": 986.0,
+      "grad_norm": 0.01738720014691353,
+      "learning_rate": 3.111111111111111e-05,
+      "loss": 0.0025,
+      "step": 986
+    },
+    {
+      "epoch": 987.0,
+      "grad_norm": 0.02450176514685154,
+      "learning_rate": 2.8888888888888888e-05,
+      "loss": 0.0026,
+      "step": 987
+    },
+    {
+      "epoch": 988.0,
+      "grad_norm": 0.016677524894475937,
+      "learning_rate": 2.666666666666667e-05,
+      "loss": 0.0025,
+      "step": 988
+    },
+    {
+      "epoch": 989.0,
+      "grad_norm": 0.014843937940895557,
+      "learning_rate": 2.4444444444444445e-05,
+      "loss": 0.0025,
+      "step": 989
+    },
+    {
+      "epoch": 990.0,
+      "grad_norm": 0.017436960712075233,
+      "learning_rate": 2.2222222222222223e-05,
+      "loss": 0.0027,
+      "step": 990
+    },
+    {
+      "epoch": 991.0,
+      "grad_norm": 0.02031978964805603,
+      "learning_rate": 2e-05,
+      "loss": 0.0026,
+      "step": 991
+    },
+    {
+      "epoch": 992.0,
+      "grad_norm": 0.018474267795681953,
+      "learning_rate": 1.7777777777777777e-05,
+      "loss": 0.0026,
+      "step": 992
+    },
+    {
+      "epoch": 993.0,
+      "grad_norm": 0.021300526335835457,
+      "learning_rate": 1.5555555555555555e-05,
+      "loss": 0.0026,
+      "step": 993
+    },
+    {
+      "epoch": 994.0,
+      "grad_norm": 0.0179448164999485,
+      "learning_rate": 1.3333333333333335e-05,
+      "loss": 0.0028,
+      "step": 994
+    },
+    {
+      "epoch": 995.0,
+      "grad_norm": 0.024742020294070244,
+      "learning_rate": 1.1111111111111112e-05,
+      "loss": 0.0028,
+      "step": 995
+    },
+    {
+      "epoch": 996.0,
+      "grad_norm": 0.02364485338330269,
+      "learning_rate": 8.888888888888888e-06,
+      "loss": 0.0028,
+      "step": 996
+    },
+    {
+      "epoch": 997.0,
+      "grad_norm": 0.021595612168312073,
+      "learning_rate": 6.6666666666666675e-06,
+      "loss": 0.0027,
+      "step": 997
+    },
+    {
+      "epoch": 998.0,
+      "grad_norm": 0.020215950906276703,
+      "learning_rate": 4.444444444444444e-06,
+      "loss": 0.0026,
+      "step": 998
+    },
+    {
+      "epoch": 999.0,
+      "grad_norm": 0.02067585475742817,
+      "learning_rate": 2.222222222222222e-06,
+      "loss": 0.0027,
+      "step": 999
+    },
+    {
+      "epoch": 1000.0,
+      "grad_norm": 0.01625109650194645,
+      "learning_rate": 0.0,
+      "loss": 0.0026,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1000,
+  "save_steps": 500,
+  "total_flos": 5525183397888000.0,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}