{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013157894736842105, "grad_norm": 0.9943248451862411, "learning_rate": 8.771929824561404e-07, "loss": 1.3563, "step": 1 }, { "epoch": 0.002631578947368421, "grad_norm": 1.0356516381932348, "learning_rate": 1.7543859649122807e-06, "loss": 1.3798, "step": 2 }, { "epoch": 0.003947368421052632, "grad_norm": 0.9909513605669136, "learning_rate": 2.631578947368421e-06, "loss": 1.3634, "step": 3 }, { "epoch": 0.005263157894736842, "grad_norm": 1.008401142445123, "learning_rate": 3.5087719298245615e-06, "loss": 1.39, "step": 4 }, { "epoch": 0.006578947368421052, "grad_norm": 0.9851935697509421, "learning_rate": 4.3859649122807014e-06, "loss": 1.366, "step": 5 }, { "epoch": 0.007894736842105263, "grad_norm": 0.9899431502727841, "learning_rate": 5.263157894736842e-06, "loss": 1.3624, "step": 6 }, { "epoch": 0.009210526315789473, "grad_norm": 0.9636771026547807, "learning_rate": 6.140350877192982e-06, "loss": 1.3356, "step": 7 }, { "epoch": 0.010526315789473684, "grad_norm": 0.9342738734933147, "learning_rate": 7.017543859649123e-06, "loss": 1.3189, "step": 8 }, { "epoch": 0.011842105263157895, "grad_norm": 0.9962701021709995, "learning_rate": 7.894736842105263e-06, "loss": 1.3505, "step": 9 }, { "epoch": 0.013157894736842105, "grad_norm": 0.942842160351429, "learning_rate": 8.771929824561403e-06, "loss": 1.3352, "step": 10 }, { "epoch": 0.014473684210526316, "grad_norm": 0.926592433083301, "learning_rate": 9.649122807017545e-06, "loss": 1.3057, "step": 11 }, { "epoch": 0.015789473684210527, "grad_norm": 0.8649365529618144, "learning_rate": 1.0526315789473684e-05, "loss": 1.2733, "step": 12 }, { "epoch": 0.017105263157894738, "grad_norm": 0.8589185686142112, "learning_rate": 1.1403508771929824e-05, "loss": 1.2501, "step": 13 }, { "epoch": 0.018421052631578946, "grad_norm": 0.7827884394796245, "learning_rate": 1.2280701754385964e-05, "loss": 1.2434, "step": 14 }, { "epoch": 0.019736842105263157, "grad_norm": 0.7242909203107426, "learning_rate": 1.3157894736842106e-05, "loss": 1.2158, "step": 15 }, { "epoch": 0.021052631578947368, "grad_norm": 0.6258453259636476, "learning_rate": 1.4035087719298246e-05, "loss": 1.174, "step": 16 }, { "epoch": 0.02236842105263158, "grad_norm": 0.581184200984528, "learning_rate": 1.4912280701754386e-05, "loss": 1.1096, "step": 17 }, { "epoch": 0.02368421052631579, "grad_norm": 0.5605953225429362, "learning_rate": 1.5789473684210526e-05, "loss": 1.1116, "step": 18 }, { "epoch": 0.025, "grad_norm": 0.5408018123073384, "learning_rate": 1.6666666666666667e-05, "loss": 1.0664, "step": 19 }, { "epoch": 0.02631578947368421, "grad_norm": 0.5521377988070432, "learning_rate": 1.7543859649122806e-05, "loss": 1.0341, "step": 20 }, { "epoch": 0.02763157894736842, "grad_norm": 0.5881737820139201, "learning_rate": 1.8421052631578947e-05, "loss": 0.994, "step": 21 }, { "epoch": 0.02894736842105263, "grad_norm": 0.635129231733668, "learning_rate": 1.929824561403509e-05, "loss": 0.9912, "step": 22 }, { "epoch": 0.030263157894736843, "grad_norm": 0.6075255860392289, "learning_rate": 2.0175438596491227e-05, "loss": 0.9355, "step": 23 }, { "epoch": 0.031578947368421054, "grad_norm": 0.6242744152482619, "learning_rate": 2.105263157894737e-05, "loss": 0.8811, "step": 24 }, { "epoch": 0.03289473684210526, "grad_norm": 0.5870365386284072, "learning_rate": 2.1929824561403507e-05, "loss": 0.8533, "step": 25 }, { "epoch": 0.034210526315789476, "grad_norm": 0.59700551817252, "learning_rate": 2.280701754385965e-05, "loss": 0.8276, "step": 26 }, { "epoch": 0.035526315789473684, "grad_norm": 0.5437014014509646, "learning_rate": 2.368421052631579e-05, "loss": 0.8294, "step": 27 }, { "epoch": 0.03684210526315789, "grad_norm": 0.5524675830958683, "learning_rate": 2.456140350877193e-05, "loss": 0.7643, "step": 28 }, { "epoch": 0.038157894736842106, "grad_norm": 0.5314274045755922, "learning_rate": 2.5438596491228074e-05, "loss": 0.7262, "step": 29 }, { "epoch": 0.039473684210526314, "grad_norm": 0.47225342909039525, "learning_rate": 2.6315789473684212e-05, "loss": 0.6754, "step": 30 }, { "epoch": 0.04078947368421053, "grad_norm": 0.44700447040118774, "learning_rate": 2.7192982456140354e-05, "loss": 0.6517, "step": 31 }, { "epoch": 0.042105263157894736, "grad_norm": 0.4497880712541435, "learning_rate": 2.8070175438596492e-05, "loss": 0.5831, "step": 32 }, { "epoch": 0.04342105263157895, "grad_norm": 0.3556582069847081, "learning_rate": 2.8947368421052634e-05, "loss": 0.6016, "step": 33 }, { "epoch": 0.04473684210526316, "grad_norm": 0.26076585757651716, "learning_rate": 2.9824561403508772e-05, "loss": 0.578, "step": 34 }, { "epoch": 0.046052631578947366, "grad_norm": 0.2673773341905264, "learning_rate": 3.0701754385964913e-05, "loss": 0.5595, "step": 35 }, { "epoch": 0.04736842105263158, "grad_norm": 0.2456506716030528, "learning_rate": 3.157894736842105e-05, "loss": 0.5533, "step": 36 }, { "epoch": 0.04868421052631579, "grad_norm": 0.22542446425364246, "learning_rate": 3.24561403508772e-05, "loss": 0.5515, "step": 37 }, { "epoch": 0.05, "grad_norm": 0.22177522330874816, "learning_rate": 3.3333333333333335e-05, "loss": 0.5391, "step": 38 }, { "epoch": 0.05131578947368421, "grad_norm": 0.2205640272585472, "learning_rate": 3.421052631578947e-05, "loss": 0.5154, "step": 39 }, { "epoch": 0.05263157894736842, "grad_norm": 0.1985886053937116, "learning_rate": 3.508771929824561e-05, "loss": 0.4966, "step": 40 }, { "epoch": 0.05394736842105263, "grad_norm": 0.17484163584959408, "learning_rate": 3.5964912280701756e-05, "loss": 0.5112, "step": 41 }, { "epoch": 0.05526315789473684, "grad_norm": 0.2050282059933558, "learning_rate": 3.6842105263157895e-05, "loss": 0.5032, "step": 42 }, { "epoch": 0.056578947368421055, "grad_norm": 0.21199590152167688, "learning_rate": 3.771929824561404e-05, "loss": 0.4952, "step": 43 }, { "epoch": 0.05789473684210526, "grad_norm": 0.17849681012135296, "learning_rate": 3.859649122807018e-05, "loss": 0.4918, "step": 44 }, { "epoch": 0.05921052631578947, "grad_norm": 0.2019786429643155, "learning_rate": 3.9473684210526316e-05, "loss": 0.4886, "step": 45 }, { "epoch": 0.060526315789473685, "grad_norm": 0.20225571639981366, "learning_rate": 4.0350877192982455e-05, "loss": 0.4926, "step": 46 }, { "epoch": 0.06184210526315789, "grad_norm": 0.1777962019167991, "learning_rate": 4.12280701754386e-05, "loss": 0.47, "step": 47 }, { "epoch": 0.06315789473684211, "grad_norm": 0.15246060517847743, "learning_rate": 4.210526315789474e-05, "loss": 0.4767, "step": 48 }, { "epoch": 0.06447368421052632, "grad_norm": 0.14898464152134963, "learning_rate": 4.298245614035088e-05, "loss": 0.4713, "step": 49 }, { "epoch": 0.06578947368421052, "grad_norm": 0.13773290443803046, "learning_rate": 4.3859649122807014e-05, "loss": 0.4743, "step": 50 }, { "epoch": 0.06710526315789474, "grad_norm": 0.1408929769423494, "learning_rate": 4.473684210526316e-05, "loss": 0.4501, "step": 51 }, { "epoch": 0.06842105263157895, "grad_norm": 0.1368756686186497, "learning_rate": 4.56140350877193e-05, "loss": 0.4526, "step": 52 }, { "epoch": 0.06973684210526315, "grad_norm": 0.13392249182682364, "learning_rate": 4.649122807017544e-05, "loss": 0.4347, "step": 53 }, { "epoch": 0.07105263157894737, "grad_norm": 0.13153597559965788, "learning_rate": 4.736842105263158e-05, "loss": 0.4631, "step": 54 }, { "epoch": 0.07236842105263158, "grad_norm": 0.1345131441878225, "learning_rate": 4.824561403508772e-05, "loss": 0.4495, "step": 55 }, { "epoch": 0.07368421052631578, "grad_norm": 0.13825684252520345, "learning_rate": 4.912280701754386e-05, "loss": 0.4393, "step": 56 }, { "epoch": 0.075, "grad_norm": 0.11938240156936133, "learning_rate": 5e-05, "loss": 0.4216, "step": 57 }, { "epoch": 0.07631578947368421, "grad_norm": 0.12724516953784049, "learning_rate": 5.087719298245615e-05, "loss": 0.4387, "step": 58 }, { "epoch": 0.07763157894736843, "grad_norm": 0.1202517377574683, "learning_rate": 5.1754385964912286e-05, "loss": 0.4375, "step": 59 }, { "epoch": 0.07894736842105263, "grad_norm": 0.11502282485893392, "learning_rate": 5.2631578947368424e-05, "loss": 0.43, "step": 60 }, { "epoch": 0.08026315789473684, "grad_norm": 0.11833243534328311, "learning_rate": 5.350877192982456e-05, "loss": 0.4502, "step": 61 }, { "epoch": 0.08157894736842106, "grad_norm": 0.1162576327218189, "learning_rate": 5.438596491228071e-05, "loss": 0.4122, "step": 62 }, { "epoch": 0.08289473684210526, "grad_norm": 0.11646926324993098, "learning_rate": 5.526315789473685e-05, "loss": 0.4281, "step": 63 }, { "epoch": 0.08421052631578947, "grad_norm": 0.10805987411499896, "learning_rate": 5.6140350877192984e-05, "loss": 0.4222, "step": 64 }, { "epoch": 0.08552631578947369, "grad_norm": 0.11176891376288024, "learning_rate": 5.701754385964912e-05, "loss": 0.416, "step": 65 }, { "epoch": 0.0868421052631579, "grad_norm": 0.12524757529866518, "learning_rate": 5.789473684210527e-05, "loss": 0.4047, "step": 66 }, { "epoch": 0.0881578947368421, "grad_norm": 0.11006405173241035, "learning_rate": 5.877192982456141e-05, "loss": 0.4266, "step": 67 }, { "epoch": 0.08947368421052632, "grad_norm": 0.12250978018085709, "learning_rate": 5.9649122807017544e-05, "loss": 0.4265, "step": 68 }, { "epoch": 0.09078947368421053, "grad_norm": 0.11723491918915026, "learning_rate": 6.052631578947369e-05, "loss": 0.4173, "step": 69 }, { "epoch": 0.09210526315789473, "grad_norm": 0.11327108284284121, "learning_rate": 6.140350877192983e-05, "loss": 0.428, "step": 70 }, { "epoch": 0.09342105263157895, "grad_norm": 0.11624261730248432, "learning_rate": 6.228070175438597e-05, "loss": 0.4181, "step": 71 }, { "epoch": 0.09473684210526316, "grad_norm": 0.1097767429764226, "learning_rate": 6.31578947368421e-05, "loss": 0.4172, "step": 72 }, { "epoch": 0.09605263157894736, "grad_norm": 0.10948304839484718, "learning_rate": 6.403508771929825e-05, "loss": 0.4114, "step": 73 }, { "epoch": 0.09736842105263158, "grad_norm": 0.11381746923579758, "learning_rate": 6.49122807017544e-05, "loss": 0.4138, "step": 74 }, { "epoch": 0.09868421052631579, "grad_norm": 0.11879309263584711, "learning_rate": 6.578947368421054e-05, "loss": 0.4234, "step": 75 }, { "epoch": 0.1, "grad_norm": 0.11504992983557438, "learning_rate": 6.666666666666667e-05, "loss": 0.4069, "step": 76 }, { "epoch": 0.1013157894736842, "grad_norm": 0.1289568391364873, "learning_rate": 6.754385964912281e-05, "loss": 0.4172, "step": 77 }, { "epoch": 0.10263157894736842, "grad_norm": 0.1027991596519212, "learning_rate": 6.842105263157895e-05, "loss": 0.4064, "step": 78 }, { "epoch": 0.10394736842105264, "grad_norm": 0.1703158129160332, "learning_rate": 6.929824561403509e-05, "loss": 0.4146, "step": 79 }, { "epoch": 0.10526315789473684, "grad_norm": 0.10978980803904749, "learning_rate": 7.017543859649122e-05, "loss": 0.3953, "step": 80 }, { "epoch": 0.10657894736842105, "grad_norm": 0.11715465011733911, "learning_rate": 7.105263157894737e-05, "loss": 0.4195, "step": 81 }, { "epoch": 0.10789473684210527, "grad_norm": 0.12584313901221594, "learning_rate": 7.192982456140351e-05, "loss": 0.4095, "step": 82 }, { "epoch": 0.10921052631578948, "grad_norm": 0.11940093441042002, "learning_rate": 7.280701754385966e-05, "loss": 0.3918, "step": 83 }, { "epoch": 0.11052631578947368, "grad_norm": 0.12190899565903182, "learning_rate": 7.368421052631579e-05, "loss": 0.3912, "step": 84 }, { "epoch": 0.1118421052631579, "grad_norm": 0.13982192975495122, "learning_rate": 7.456140350877193e-05, "loss": 0.3741, "step": 85 }, { "epoch": 0.11315789473684211, "grad_norm": 0.11868632446195661, "learning_rate": 7.543859649122808e-05, "loss": 0.4007, "step": 86 }, { "epoch": 0.11447368421052631, "grad_norm": 0.12861533906094824, "learning_rate": 7.631578947368422e-05, "loss": 0.4054, "step": 87 }, { "epoch": 0.11578947368421053, "grad_norm": 0.11712395700046184, "learning_rate": 7.719298245614036e-05, "loss": 0.3942, "step": 88 }, { "epoch": 0.11710526315789474, "grad_norm": 0.11911543233284826, "learning_rate": 7.807017543859649e-05, "loss": 0.3992, "step": 89 }, { "epoch": 0.11842105263157894, "grad_norm": 0.12597614512349814, "learning_rate": 7.894736842105263e-05, "loss": 0.3868, "step": 90 }, { "epoch": 0.11973684210526316, "grad_norm": 0.11923027766958509, "learning_rate": 7.982456140350878e-05, "loss": 0.3994, "step": 91 }, { "epoch": 0.12105263157894737, "grad_norm": 0.11704509885009143, "learning_rate": 8.070175438596491e-05, "loss": 0.3813, "step": 92 }, { "epoch": 0.12236842105263158, "grad_norm": 0.1415178300748815, "learning_rate": 8.157894736842105e-05, "loss": 0.3995, "step": 93 }, { "epoch": 0.12368421052631579, "grad_norm": 0.12356686708530311, "learning_rate": 8.24561403508772e-05, "loss": 0.385, "step": 94 }, { "epoch": 0.125, "grad_norm": 0.12008298974430553, "learning_rate": 8.333333333333334e-05, "loss": 0.3849, "step": 95 }, { "epoch": 0.12631578947368421, "grad_norm": 0.11678986714389192, "learning_rate": 8.421052631578948e-05, "loss": 0.3763, "step": 96 }, { "epoch": 0.12763157894736843, "grad_norm": 0.130987416202565, "learning_rate": 8.508771929824562e-05, "loss": 0.3795, "step": 97 }, { "epoch": 0.12894736842105264, "grad_norm": 0.13585638455268312, "learning_rate": 8.596491228070177e-05, "loss": 0.3714, "step": 98 }, { "epoch": 0.13026315789473683, "grad_norm": 0.1343330070878276, "learning_rate": 8.68421052631579e-05, "loss": 0.3686, "step": 99 }, { "epoch": 0.13157894736842105, "grad_norm": 0.1344794137365553, "learning_rate": 8.771929824561403e-05, "loss": 0.3963, "step": 100 }, { "epoch": 0.13289473684210526, "grad_norm": 0.13882732763835956, "learning_rate": 8.859649122807017e-05, "loss": 0.3759, "step": 101 }, { "epoch": 0.13421052631578947, "grad_norm": 0.13720598333651746, "learning_rate": 8.947368421052632e-05, "loss": 0.3821, "step": 102 }, { "epoch": 0.1355263157894737, "grad_norm": 0.14777326263342197, "learning_rate": 9.035087719298246e-05, "loss": 0.3667, "step": 103 }, { "epoch": 0.1368421052631579, "grad_norm": 0.13337880990510198, "learning_rate": 9.12280701754386e-05, "loss": 0.3978, "step": 104 }, { "epoch": 0.13815789473684212, "grad_norm": 0.14435581876340933, "learning_rate": 9.210526315789474e-05, "loss": 0.3751, "step": 105 }, { "epoch": 0.1394736842105263, "grad_norm": 0.1307662742116557, "learning_rate": 9.298245614035089e-05, "loss": 0.3898, "step": 106 }, { "epoch": 0.14078947368421052, "grad_norm": 0.13020669111992744, "learning_rate": 9.385964912280703e-05, "loss": 0.3904, "step": 107 }, { "epoch": 0.14210526315789473, "grad_norm": 0.14245838078165218, "learning_rate": 9.473684210526316e-05, "loss": 0.3814, "step": 108 }, { "epoch": 0.14342105263157895, "grad_norm": 0.13114307459864877, "learning_rate": 9.56140350877193e-05, "loss": 0.374, "step": 109 }, { "epoch": 0.14473684210526316, "grad_norm": 0.12257270456238782, "learning_rate": 9.649122807017544e-05, "loss": 0.3835, "step": 110 }, { "epoch": 0.14605263157894738, "grad_norm": 0.1377457018530249, "learning_rate": 9.736842105263158e-05, "loss": 0.3772, "step": 111 }, { "epoch": 0.14736842105263157, "grad_norm": 0.13906509029436523, "learning_rate": 9.824561403508771e-05, "loss": 0.3777, "step": 112 }, { "epoch": 0.14868421052631578, "grad_norm": 0.13702341765645296, "learning_rate": 9.912280701754386e-05, "loss": 0.3778, "step": 113 }, { "epoch": 0.15, "grad_norm": 0.131545294973937, "learning_rate": 0.0001, "loss": 0.3676, "step": 114 }, { "epoch": 0.1513157894736842, "grad_norm": 0.144839378587314, "learning_rate": 0.00010087719298245615, "loss": 0.3656, "step": 115 }, { "epoch": 0.15263157894736842, "grad_norm": 0.13801404126106712, "learning_rate": 0.0001017543859649123, "loss": 0.3698, "step": 116 }, { "epoch": 0.15394736842105264, "grad_norm": 0.14250664033740984, "learning_rate": 0.00010263157894736844, "loss": 0.3556, "step": 117 }, { "epoch": 0.15526315789473685, "grad_norm": 0.14495828716442766, "learning_rate": 0.00010350877192982457, "loss": 0.3874, "step": 118 }, { "epoch": 0.15657894736842104, "grad_norm": 0.1458656065808899, "learning_rate": 0.0001043859649122807, "loss": 0.3707, "step": 119 }, { "epoch": 0.15789473684210525, "grad_norm": 0.15169186829219217, "learning_rate": 0.00010526315789473685, "loss": 0.3807, "step": 120 }, { "epoch": 0.15921052631578947, "grad_norm": 0.1384837433981043, "learning_rate": 0.00010614035087719298, "loss": 0.3606, "step": 121 }, { "epoch": 0.16052631578947368, "grad_norm": 0.1397696554751311, "learning_rate": 0.00010701754385964912, "loss": 0.3703, "step": 122 }, { "epoch": 0.1618421052631579, "grad_norm": 0.14890695623711342, "learning_rate": 0.00010789473684210527, "loss": 0.3613, "step": 123 }, { "epoch": 0.1631578947368421, "grad_norm": 0.15703056192380713, "learning_rate": 0.00010877192982456141, "loss": 0.3504, "step": 124 }, { "epoch": 0.16447368421052633, "grad_norm": 0.1392335253666652, "learning_rate": 0.00010964912280701756, "loss": 0.3708, "step": 125 }, { "epoch": 0.16578947368421051, "grad_norm": 0.14687002166712368, "learning_rate": 0.0001105263157894737, "loss": 0.3698, "step": 126 }, { "epoch": 0.16710526315789473, "grad_norm": 0.15870994148978584, "learning_rate": 0.00011140350877192982, "loss": 0.3779, "step": 127 }, { "epoch": 0.16842105263157894, "grad_norm": 0.14754629529682614, "learning_rate": 0.00011228070175438597, "loss": 0.3623, "step": 128 }, { "epoch": 0.16973684210526316, "grad_norm": 0.15124044879001075, "learning_rate": 0.00011315789473684211, "loss": 0.3608, "step": 129 }, { "epoch": 0.17105263157894737, "grad_norm": 0.14959917991128074, "learning_rate": 0.00011403508771929824, "loss": 0.3642, "step": 130 }, { "epoch": 0.1723684210526316, "grad_norm": 0.1571767924920792, "learning_rate": 0.00011491228070175439, "loss": 0.3631, "step": 131 }, { "epoch": 0.1736842105263158, "grad_norm": 0.14490254678965067, "learning_rate": 0.00011578947368421053, "loss": 0.3559, "step": 132 }, { "epoch": 0.175, "grad_norm": 0.144229202921289, "learning_rate": 0.00011666666666666668, "loss": 0.3546, "step": 133 }, { "epoch": 0.1763157894736842, "grad_norm": 0.1600537686886715, "learning_rate": 0.00011754385964912282, "loss": 0.3455, "step": 134 }, { "epoch": 0.17763157894736842, "grad_norm": 0.1313150257696866, "learning_rate": 0.00011842105263157894, "loss": 0.3552, "step": 135 }, { "epoch": 0.17894736842105263, "grad_norm": 0.13732726897904224, "learning_rate": 0.00011929824561403509, "loss": 0.3497, "step": 136 }, { "epoch": 0.18026315789473685, "grad_norm": 0.1463571482716898, "learning_rate": 0.00012017543859649123, "loss": 0.3519, "step": 137 }, { "epoch": 0.18157894736842106, "grad_norm": 0.1424850899462053, "learning_rate": 0.00012105263157894738, "loss": 0.3475, "step": 138 }, { "epoch": 0.18289473684210528, "grad_norm": 0.15624853614650133, "learning_rate": 0.00012192982456140352, "loss": 0.3541, "step": 139 }, { "epoch": 0.18421052631578946, "grad_norm": 0.15391671545839364, "learning_rate": 0.00012280701754385965, "loss": 0.347, "step": 140 }, { "epoch": 0.18552631578947368, "grad_norm": 0.1476455568748672, "learning_rate": 0.0001236842105263158, "loss": 0.3641, "step": 141 }, { "epoch": 0.1868421052631579, "grad_norm": 0.15837728677358026, "learning_rate": 0.00012456140350877194, "loss": 0.3542, "step": 142 }, { "epoch": 0.1881578947368421, "grad_norm": 0.1387983783021263, "learning_rate": 0.00012543859649122806, "loss": 0.3425, "step": 143 }, { "epoch": 0.18947368421052632, "grad_norm": 0.15375599222156547, "learning_rate": 0.0001263157894736842, "loss": 0.353, "step": 144 }, { "epoch": 0.19078947368421054, "grad_norm": 0.14286343766479473, "learning_rate": 0.00012719298245614035, "loss": 0.3425, "step": 145 }, { "epoch": 0.19210526315789472, "grad_norm": 0.15500148844719516, "learning_rate": 0.0001280701754385965, "loss": 0.3536, "step": 146 }, { "epoch": 0.19342105263157894, "grad_norm": 0.1527274581882464, "learning_rate": 0.00012894736842105264, "loss": 0.3529, "step": 147 }, { "epoch": 0.19473684210526315, "grad_norm": 0.13911203035920786, "learning_rate": 0.0001298245614035088, "loss": 0.3582, "step": 148 }, { "epoch": 0.19605263157894737, "grad_norm": 0.1603641558907569, "learning_rate": 0.00013070175438596493, "loss": 0.3379, "step": 149 }, { "epoch": 0.19736842105263158, "grad_norm": 0.15724848532335062, "learning_rate": 0.00013157894736842108, "loss": 0.3447, "step": 150 }, { "epoch": 0.1986842105263158, "grad_norm": 0.15104286411263596, "learning_rate": 0.0001324561403508772, "loss": 0.3592, "step": 151 }, { "epoch": 0.2, "grad_norm": 0.14256111417333361, "learning_rate": 0.00013333333333333334, "loss": 0.3418, "step": 152 }, { "epoch": 0.2013157894736842, "grad_norm": 0.14616033132964826, "learning_rate": 0.00013421052631578948, "loss": 0.365, "step": 153 }, { "epoch": 0.2026315789473684, "grad_norm": 0.13617570529003223, "learning_rate": 0.00013508771929824563, "loss": 0.3687, "step": 154 }, { "epoch": 0.20394736842105263, "grad_norm": 0.15392252191058636, "learning_rate": 0.00013596491228070177, "loss": 0.3481, "step": 155 }, { "epoch": 0.20526315789473684, "grad_norm": 0.14731210087429397, "learning_rate": 0.0001368421052631579, "loss": 0.3578, "step": 156 }, { "epoch": 0.20657894736842106, "grad_norm": 0.15561784274185514, "learning_rate": 0.00013771929824561404, "loss": 0.3592, "step": 157 }, { "epoch": 0.20789473684210527, "grad_norm": 0.15066657618186907, "learning_rate": 0.00013859649122807018, "loss": 0.3467, "step": 158 }, { "epoch": 0.20921052631578949, "grad_norm": 0.1420611399655039, "learning_rate": 0.0001394736842105263, "loss": 0.3616, "step": 159 }, { "epoch": 0.21052631578947367, "grad_norm": 0.1488250715622558, "learning_rate": 0.00014035087719298245, "loss": 0.339, "step": 160 }, { "epoch": 0.2118421052631579, "grad_norm": 0.1462212684656958, "learning_rate": 0.0001412280701754386, "loss": 0.3414, "step": 161 }, { "epoch": 0.2131578947368421, "grad_norm": 0.13807950981607273, "learning_rate": 0.00014210526315789474, "loss": 0.3342, "step": 162 }, { "epoch": 0.21447368421052632, "grad_norm": 0.15320658283775204, "learning_rate": 0.00014298245614035088, "loss": 0.3448, "step": 163 }, { "epoch": 0.21578947368421053, "grad_norm": 0.1465532353318612, "learning_rate": 0.00014385964912280703, "loss": 0.3472, "step": 164 }, { "epoch": 0.21710526315789475, "grad_norm": 0.14706679089630642, "learning_rate": 0.00014473684210526317, "loss": 0.3437, "step": 165 }, { "epoch": 0.21842105263157896, "grad_norm": 0.15624893484875924, "learning_rate": 0.00014561403508771932, "loss": 0.3417, "step": 166 }, { "epoch": 0.21973684210526315, "grad_norm": 0.14729900208480234, "learning_rate": 0.00014649122807017543, "loss": 0.3651, "step": 167 }, { "epoch": 0.22105263157894736, "grad_norm": 0.13340629262561993, "learning_rate": 0.00014736842105263158, "loss": 0.3452, "step": 168 }, { "epoch": 0.22236842105263158, "grad_norm": 0.1427638316007593, "learning_rate": 0.00014824561403508772, "loss": 0.3368, "step": 169 }, { "epoch": 0.2236842105263158, "grad_norm": 0.1394836492366252, "learning_rate": 0.00014912280701754387, "loss": 0.3366, "step": 170 }, { "epoch": 0.225, "grad_norm": 0.12982465074656047, "learning_rate": 0.00015000000000000001, "loss": 0.3266, "step": 171 }, { "epoch": 0.22631578947368422, "grad_norm": 0.1489587612015448, "learning_rate": 0.00015087719298245616, "loss": 0.3342, "step": 172 }, { "epoch": 0.22763157894736843, "grad_norm": 0.15658289578441603, "learning_rate": 0.0001517543859649123, "loss": 0.3461, "step": 173 }, { "epoch": 0.22894736842105262, "grad_norm": 0.13299000630905108, "learning_rate": 0.00015263157894736845, "loss": 0.3343, "step": 174 }, { "epoch": 0.23026315789473684, "grad_norm": 0.1501297297839127, "learning_rate": 0.00015350877192982457, "loss": 0.3319, "step": 175 }, { "epoch": 0.23157894736842105, "grad_norm": 0.1447458092769113, "learning_rate": 0.0001543859649122807, "loss": 0.3538, "step": 176 }, { "epoch": 0.23289473684210527, "grad_norm": 0.12642159107807327, "learning_rate": 0.00015526315789473686, "loss": 0.3507, "step": 177 }, { "epoch": 0.23421052631578948, "grad_norm": 0.1307467882888706, "learning_rate": 0.00015614035087719297, "loss": 0.3352, "step": 178 }, { "epoch": 0.2355263157894737, "grad_norm": 0.14575680367060406, "learning_rate": 0.00015701754385964912, "loss": 0.3415, "step": 179 }, { "epoch": 0.23684210526315788, "grad_norm": 0.13772178353175255, "learning_rate": 0.00015789473684210527, "loss": 0.3517, "step": 180 }, { "epoch": 0.2381578947368421, "grad_norm": 0.13177565526904855, "learning_rate": 0.0001587719298245614, "loss": 0.326, "step": 181 }, { "epoch": 0.2394736842105263, "grad_norm": 0.13895296380351296, "learning_rate": 0.00015964912280701756, "loss": 0.3418, "step": 182 }, { "epoch": 0.24078947368421053, "grad_norm": 0.13234030291334134, "learning_rate": 0.0001605263157894737, "loss": 0.3492, "step": 183 }, { "epoch": 0.24210526315789474, "grad_norm": 0.1343103364584722, "learning_rate": 0.00016140350877192982, "loss": 0.334, "step": 184 }, { "epoch": 0.24342105263157895, "grad_norm": 0.13330977836549726, "learning_rate": 0.00016228070175438596, "loss": 0.342, "step": 185 }, { "epoch": 0.24473684210526317, "grad_norm": 0.12933054253662082, "learning_rate": 0.0001631578947368421, "loss": 0.3476, "step": 186 }, { "epoch": 0.24605263157894736, "grad_norm": 0.12637139899067856, "learning_rate": 0.00016403508771929825, "loss": 0.3317, "step": 187 }, { "epoch": 0.24736842105263157, "grad_norm": 0.12268533782351379, "learning_rate": 0.0001649122807017544, "loss": 0.3369, "step": 188 }, { "epoch": 0.24868421052631579, "grad_norm": 0.1269784203784765, "learning_rate": 0.00016578947368421054, "loss": 0.3555, "step": 189 }, { "epoch": 0.25, "grad_norm": 0.13455995822795366, "learning_rate": 0.0001666666666666667, "loss": 0.3352, "step": 190 }, { "epoch": 0.2513157894736842, "grad_norm": 0.13042417788105207, "learning_rate": 0.00016754385964912283, "loss": 0.3425, "step": 191 }, { "epoch": 0.25263157894736843, "grad_norm": 0.13259270895700348, "learning_rate": 0.00016842105263157895, "loss": 0.3388, "step": 192 }, { "epoch": 0.25394736842105264, "grad_norm": 0.13229060651188568, "learning_rate": 0.0001692982456140351, "loss": 0.3444, "step": 193 }, { "epoch": 0.25526315789473686, "grad_norm": 0.1409002440562877, "learning_rate": 0.00017017543859649124, "loss": 0.326, "step": 194 }, { "epoch": 0.2565789473684211, "grad_norm": 0.13331986255382416, "learning_rate": 0.00017105263157894739, "loss": 0.3333, "step": 195 }, { "epoch": 0.2578947368421053, "grad_norm": 0.13211357309903607, "learning_rate": 0.00017192982456140353, "loss": 0.3303, "step": 196 }, { "epoch": 0.25921052631578945, "grad_norm": 0.13392529140137704, "learning_rate": 0.00017280701754385965, "loss": 0.3266, "step": 197 }, { "epoch": 0.26052631578947366, "grad_norm": 0.135716179073066, "learning_rate": 0.0001736842105263158, "loss": 0.3372, "step": 198 }, { "epoch": 0.2618421052631579, "grad_norm": 0.1325252534873712, "learning_rate": 0.00017456140350877194, "loss": 0.3377, "step": 199 }, { "epoch": 0.2631578947368421, "grad_norm": 0.13891964420010164, "learning_rate": 0.00017543859649122806, "loss": 0.3359, "step": 200 }, { "epoch": 0.2644736842105263, "grad_norm": 0.14101443449526624, "learning_rate": 0.0001763157894736842, "loss": 0.3254, "step": 201 }, { "epoch": 0.2657894736842105, "grad_norm": 0.12637174218931757, "learning_rate": 0.00017719298245614035, "loss": 0.3372, "step": 202 }, { "epoch": 0.26710526315789473, "grad_norm": 0.13174772610887958, "learning_rate": 0.0001780701754385965, "loss": 0.3365, "step": 203 }, { "epoch": 0.26842105263157895, "grad_norm": 0.13681575981535574, "learning_rate": 0.00017894736842105264, "loss": 0.3356, "step": 204 }, { "epoch": 0.26973684210526316, "grad_norm": 0.1328471909403007, "learning_rate": 0.00017982456140350878, "loss": 0.33, "step": 205 }, { "epoch": 0.2710526315789474, "grad_norm": 0.1407266391635488, "learning_rate": 0.00018070175438596493, "loss": 0.3391, "step": 206 }, { "epoch": 0.2723684210526316, "grad_norm": 0.1348033610448726, "learning_rate": 0.00018157894736842107, "loss": 0.3239, "step": 207 }, { "epoch": 0.2736842105263158, "grad_norm": 0.1265119642350682, "learning_rate": 0.0001824561403508772, "loss": 0.3345, "step": 208 }, { "epoch": 0.275, "grad_norm": 0.12645375412506446, "learning_rate": 0.00018333333333333334, "loss": 0.3241, "step": 209 }, { "epoch": 0.27631578947368424, "grad_norm": 0.13126982713178578, "learning_rate": 0.00018421052631578948, "loss": 0.3367, "step": 210 }, { "epoch": 0.2776315789473684, "grad_norm": 0.1243720842002383, "learning_rate": 0.00018508771929824563, "loss": 0.329, "step": 211 }, { "epoch": 0.2789473684210526, "grad_norm": 0.14402206242169863, "learning_rate": 0.00018596491228070177, "loss": 0.3321, "step": 212 }, { "epoch": 0.2802631578947368, "grad_norm": 0.12463209754003, "learning_rate": 0.00018684210526315792, "loss": 0.3191, "step": 213 }, { "epoch": 0.28157894736842104, "grad_norm": 0.11980526105079334, "learning_rate": 0.00018771929824561406, "loss": 0.3238, "step": 214 }, { "epoch": 0.28289473684210525, "grad_norm": 0.13443132852719006, "learning_rate": 0.0001885964912280702, "loss": 0.3409, "step": 215 }, { "epoch": 0.28421052631578947, "grad_norm": 0.12477177793237498, "learning_rate": 0.00018947368421052632, "loss": 0.334, "step": 216 }, { "epoch": 0.2855263157894737, "grad_norm": 0.12434521968333248, "learning_rate": 0.00019035087719298247, "loss": 0.3301, "step": 217 }, { "epoch": 0.2868421052631579, "grad_norm": 0.12926013045173962, "learning_rate": 0.0001912280701754386, "loss": 0.3403, "step": 218 }, { "epoch": 0.2881578947368421, "grad_norm": 0.12767169045744933, "learning_rate": 0.00019210526315789473, "loss": 0.3285, "step": 219 }, { "epoch": 0.2894736842105263, "grad_norm": 0.12496998663175744, "learning_rate": 0.00019298245614035088, "loss": 0.3485, "step": 220 }, { "epoch": 0.29078947368421054, "grad_norm": 0.12044174136139364, "learning_rate": 0.00019385964912280702, "loss": 0.3264, "step": 221 }, { "epoch": 0.29210526315789476, "grad_norm": 0.13199190001366287, "learning_rate": 0.00019473684210526317, "loss": 0.3422, "step": 222 }, { "epoch": 0.29342105263157897, "grad_norm": 0.1201369924836855, "learning_rate": 0.0001956140350877193, "loss": 0.3244, "step": 223 }, { "epoch": 0.29473684210526313, "grad_norm": 0.11423784430905021, "learning_rate": 0.00019649122807017543, "loss": 0.3347, "step": 224 }, { "epoch": 0.29605263157894735, "grad_norm": 0.12466053172472819, "learning_rate": 0.00019736842105263157, "loss": 0.3309, "step": 225 }, { "epoch": 0.29736842105263156, "grad_norm": 0.12442689437527811, "learning_rate": 0.00019824561403508772, "loss": 0.3354, "step": 226 }, { "epoch": 0.2986842105263158, "grad_norm": 0.11914676425566721, "learning_rate": 0.00019912280701754386, "loss": 0.3367, "step": 227 }, { "epoch": 0.3, "grad_norm": 0.12114432656113984, "learning_rate": 0.0002, "loss": 0.3225, "step": 228 }, { "epoch": 0.3013157894736842, "grad_norm": 0.12708865662155097, "learning_rate": 0.00019999988280341633, "loss": 0.3267, "step": 229 }, { "epoch": 0.3026315789473684, "grad_norm": 0.12093461278364943, "learning_rate": 0.00019999953121394002, "loss": 0.3341, "step": 230 }, { "epoch": 0.30394736842105263, "grad_norm": 0.1259425626165656, "learning_rate": 0.00019999894523239515, "loss": 0.3352, "step": 231 }, { "epoch": 0.30526315789473685, "grad_norm": 0.11881152231373332, "learning_rate": 0.00019999812486015523, "loss": 0.3263, "step": 232 }, { "epoch": 0.30657894736842106, "grad_norm": 0.13525301351060123, "learning_rate": 0.00019999707009914317, "loss": 0.3405, "step": 233 }, { "epoch": 0.3078947368421053, "grad_norm": 0.12155988934380389, "learning_rate": 0.00019999578095183124, "loss": 0.3264, "step": 234 }, { "epoch": 0.3092105263157895, "grad_norm": 0.12309971712704007, "learning_rate": 0.00019999425742124114, "loss": 0.3126, "step": 235 }, { "epoch": 0.3105263157894737, "grad_norm": 0.12559904653470885, "learning_rate": 0.00019999249951094388, "loss": 0.3436, "step": 236 }, { "epoch": 0.3118421052631579, "grad_norm": 0.11464091226244318, "learning_rate": 0.00019999050722505993, "loss": 0.3212, "step": 237 }, { "epoch": 0.3131578947368421, "grad_norm": 0.11798628745307271, "learning_rate": 0.000199988280568259, "loss": 0.3187, "step": 238 }, { "epoch": 0.3144736842105263, "grad_norm": 0.11937212399519956, "learning_rate": 0.00019998581954576032, "loss": 0.3261, "step": 239 }, { "epoch": 0.3157894736842105, "grad_norm": 0.10973860755841888, "learning_rate": 0.00019998312416333227, "loss": 0.3097, "step": 240 }, { "epoch": 0.3171052631578947, "grad_norm": 0.11870178537603401, "learning_rate": 0.00019998019442729273, "loss": 0.3264, "step": 241 }, { "epoch": 0.31842105263157894, "grad_norm": 0.11706254926047537, "learning_rate": 0.0001999770303445087, "loss": 0.3276, "step": 242 }, { "epoch": 0.31973684210526315, "grad_norm": 0.12369716377452905, "learning_rate": 0.00019997363192239664, "loss": 0.3178, "step": 243 }, { "epoch": 0.32105263157894737, "grad_norm": 0.10989749847863158, "learning_rate": 0.0001999699991689222, "loss": 0.3126, "step": 244 }, { "epoch": 0.3223684210526316, "grad_norm": 0.11488770235752266, "learning_rate": 0.00019996613209260033, "loss": 0.3305, "step": 245 }, { "epoch": 0.3236842105263158, "grad_norm": 0.11550955957920862, "learning_rate": 0.00019996203070249516, "loss": 0.3395, "step": 246 }, { "epoch": 0.325, "grad_norm": 0.10896874824994508, "learning_rate": 0.0001999576950082201, "loss": 0.3266, "step": 247 }, { "epoch": 0.3263157894736842, "grad_norm": 0.11429922245859891, "learning_rate": 0.00019995312501993765, "loss": 0.3241, "step": 248 }, { "epoch": 0.32763157894736844, "grad_norm": 0.11185855088004702, "learning_rate": 0.00019994832074835963, "loss": 0.3215, "step": 249 }, { "epoch": 0.32894736842105265, "grad_norm": 0.10618407427160971, "learning_rate": 0.00019994328220474688, "loss": 0.3114, "step": 250 }, { "epoch": 0.33026315789473687, "grad_norm": 0.10827179543543274, "learning_rate": 0.00019993800940090942, "loss": 0.3275, "step": 251 }, { "epoch": 0.33157894736842103, "grad_norm": 0.11600822718215488, "learning_rate": 0.00019993250234920636, "loss": 0.3205, "step": 252 }, { "epoch": 0.33289473684210524, "grad_norm": 0.11660805309271545, "learning_rate": 0.00019992676106254584, "loss": 0.338, "step": 253 }, { "epoch": 0.33421052631578946, "grad_norm": 0.11992791956747861, "learning_rate": 0.00019992078555438502, "loss": 0.33, "step": 254 }, { "epoch": 0.3355263157894737, "grad_norm": 0.11271215008229082, "learning_rate": 0.0001999145758387301, "loss": 0.3312, "step": 255 }, { "epoch": 0.3368421052631579, "grad_norm": 0.10842995849313372, "learning_rate": 0.00019990813193013625, "loss": 0.3257, "step": 256 }, { "epoch": 0.3381578947368421, "grad_norm": 0.10874370438818506, "learning_rate": 0.0001999014538437075, "loss": 0.3128, "step": 257 }, { "epoch": 0.3394736842105263, "grad_norm": 0.11809880440680368, "learning_rate": 0.0001998945415950969, "loss": 0.3211, "step": 258 }, { "epoch": 0.34078947368421053, "grad_norm": 0.11661765420730584, "learning_rate": 0.00019988739520050618, "loss": 0.3148, "step": 259 }, { "epoch": 0.34210526315789475, "grad_norm": 0.10343508953195514, "learning_rate": 0.0001998800146766861, "loss": 0.3217, "step": 260 }, { "epoch": 0.34342105263157896, "grad_norm": 0.11172563699747126, "learning_rate": 0.0001998724000409361, "loss": 0.3098, "step": 261 }, { "epoch": 0.3447368421052632, "grad_norm": 0.11124141647463505, "learning_rate": 0.00019986455131110428, "loss": 0.3177, "step": 262 }, { "epoch": 0.3460526315789474, "grad_norm": 0.1049604206863391, "learning_rate": 0.00019985646850558764, "loss": 0.3202, "step": 263 }, { "epoch": 0.3473684210526316, "grad_norm": 0.11674285217463065, "learning_rate": 0.00019984815164333163, "loss": 0.3325, "step": 264 }, { "epoch": 0.34868421052631576, "grad_norm": 0.10939260536399624, "learning_rate": 0.00019983960074383046, "loss": 0.3135, "step": 265 }, { "epoch": 0.35, "grad_norm": 0.11228143225921142, "learning_rate": 0.00019983081582712685, "loss": 0.3184, "step": 266 }, { "epoch": 0.3513157894736842, "grad_norm": 0.11736606638037488, "learning_rate": 0.000199821796913812, "loss": 0.3228, "step": 267 }, { "epoch": 0.3526315789473684, "grad_norm": 0.10870195278237296, "learning_rate": 0.00019981254402502566, "loss": 0.3162, "step": 268 }, { "epoch": 0.3539473684210526, "grad_norm": 0.10319062608987703, "learning_rate": 0.000199803057182456, "loss": 0.3088, "step": 269 }, { "epoch": 0.35526315789473684, "grad_norm": 0.11157466968877176, "learning_rate": 0.00019979333640833947, "loss": 0.3157, "step": 270 }, { "epoch": 0.35657894736842105, "grad_norm": 0.11150128423279522, "learning_rate": 0.00019978338172546093, "loss": 0.3101, "step": 271 }, { "epoch": 0.35789473684210527, "grad_norm": 0.1085421577182214, "learning_rate": 0.0001997731931571535, "loss": 0.3126, "step": 272 }, { "epoch": 0.3592105263157895, "grad_norm": 0.10475914894230995, "learning_rate": 0.00019976277072729845, "loss": 0.3224, "step": 273 }, { "epoch": 0.3605263157894737, "grad_norm": 0.1082831141347499, "learning_rate": 0.00019975211446032526, "loss": 0.3082, "step": 274 }, { "epoch": 0.3618421052631579, "grad_norm": 0.10740486742911703, "learning_rate": 0.0001997412243812115, "loss": 0.3257, "step": 275 }, { "epoch": 0.3631578947368421, "grad_norm": 0.1020992105394808, "learning_rate": 0.00019973010051548275, "loss": 0.3156, "step": 276 }, { "epoch": 0.36447368421052634, "grad_norm": 0.10963434549263805, "learning_rate": 0.0001997187428892126, "loss": 0.312, "step": 277 }, { "epoch": 0.36578947368421055, "grad_norm": 0.10738473907360331, "learning_rate": 0.00019970715152902254, "loss": 0.3264, "step": 278 }, { "epoch": 0.3671052631578947, "grad_norm": 0.10172040132714297, "learning_rate": 0.00019969532646208195, "loss": 0.3, "step": 279 }, { "epoch": 0.3684210526315789, "grad_norm": 0.10348844172942914, "learning_rate": 0.00019968326771610797, "loss": 0.3116, "step": 280 }, { "epoch": 0.36973684210526314, "grad_norm": 0.11435260003026458, "learning_rate": 0.00019967097531936546, "loss": 0.3196, "step": 281 }, { "epoch": 0.37105263157894736, "grad_norm": 0.1121081703641299, "learning_rate": 0.000199658449300667, "loss": 0.3226, "step": 282 }, { "epoch": 0.37236842105263157, "grad_norm": 0.1067537506801231, "learning_rate": 0.00019964568968937267, "loss": 0.3027, "step": 283 }, { "epoch": 0.3736842105263158, "grad_norm": 0.11350941919576825, "learning_rate": 0.00019963269651539017, "loss": 0.3328, "step": 284 }, { "epoch": 0.375, "grad_norm": 0.10315345770730597, "learning_rate": 0.00019961946980917456, "loss": 0.3007, "step": 285 }, { "epoch": 0.3763157894736842, "grad_norm": 0.11217086438246132, "learning_rate": 0.0001996060096017284, "loss": 0.307, "step": 286 }, { "epoch": 0.37763157894736843, "grad_norm": 0.10830051369447247, "learning_rate": 0.00019959231592460143, "loss": 0.3128, "step": 287 }, { "epoch": 0.37894736842105264, "grad_norm": 0.11069575850173559, "learning_rate": 0.00019957838880989078, "loss": 0.3096, "step": 288 }, { "epoch": 0.38026315789473686, "grad_norm": 0.1085222475355183, "learning_rate": 0.00019956422829024055, "loss": 0.315, "step": 289 }, { "epoch": 0.3815789473684211, "grad_norm": 0.09952894778052519, "learning_rate": 0.0001995498343988421, "loss": 0.3101, "step": 290 }, { "epoch": 0.3828947368421053, "grad_norm": 0.11065531030038152, "learning_rate": 0.00019953520716943371, "loss": 0.3088, "step": 291 }, { "epoch": 0.38421052631578945, "grad_norm": 0.11350469293630033, "learning_rate": 0.00019952034663630062, "loss": 0.318, "step": 292 }, { "epoch": 0.38552631578947366, "grad_norm": 0.11212904813152226, "learning_rate": 0.00019950525283427491, "loss": 0.3172, "step": 293 }, { "epoch": 0.3868421052631579, "grad_norm": 0.11122338935927506, "learning_rate": 0.00019948992579873538, "loss": 0.3058, "step": 294 }, { "epoch": 0.3881578947368421, "grad_norm": 0.11166549208778644, "learning_rate": 0.0001994743655656076, "loss": 0.3014, "step": 295 }, { "epoch": 0.3894736842105263, "grad_norm": 0.11276484187285273, "learning_rate": 0.00019945857217136363, "loss": 0.3177, "step": 296 }, { "epoch": 0.3907894736842105, "grad_norm": 0.111642408366338, "learning_rate": 0.00019944254565302217, "loss": 0.3077, "step": 297 }, { "epoch": 0.39210526315789473, "grad_norm": 0.12415010602542942, "learning_rate": 0.00019942628604814825, "loss": 0.3109, "step": 298 }, { "epoch": 0.39342105263157895, "grad_norm": 0.10701081689961081, "learning_rate": 0.00019940979339485332, "loss": 0.3027, "step": 299 }, { "epoch": 0.39473684210526316, "grad_norm": 0.1036095978911461, "learning_rate": 0.00019939306773179497, "loss": 0.3155, "step": 300 }, { "epoch": 0.3960526315789474, "grad_norm": 0.12314325764301473, "learning_rate": 0.00019937610909817702, "loss": 0.3197, "step": 301 }, { "epoch": 0.3973684210526316, "grad_norm": 0.10965174611997063, "learning_rate": 0.0001993589175337494, "loss": 0.3055, "step": 302 }, { "epoch": 0.3986842105263158, "grad_norm": 0.10691327868633747, "learning_rate": 0.00019934149307880791, "loss": 0.3047, "step": 303 }, { "epoch": 0.4, "grad_norm": 0.11048488675527199, "learning_rate": 0.00019932383577419432, "loss": 0.3162, "step": 304 }, { "epoch": 0.40131578947368424, "grad_norm": 0.115786509511319, "learning_rate": 0.0001993059456612961, "loss": 0.2998, "step": 305 }, { "epoch": 0.4026315789473684, "grad_norm": 0.10197052941221454, "learning_rate": 0.0001992878227820465, "loss": 0.3167, "step": 306 }, { "epoch": 0.4039473684210526, "grad_norm": 0.10818823997320136, "learning_rate": 0.00019926946717892428, "loss": 0.3175, "step": 307 }, { "epoch": 0.4052631578947368, "grad_norm": 0.11647859718654414, "learning_rate": 0.00019925087889495374, "loss": 0.3223, "step": 308 }, { "epoch": 0.40657894736842104, "grad_norm": 0.10764841122975913, "learning_rate": 0.0001992320579737045, "loss": 0.3134, "step": 309 }, { "epoch": 0.40789473684210525, "grad_norm": 0.10744012941703222, "learning_rate": 0.0001992130044592916, "loss": 0.3072, "step": 310 }, { "epoch": 0.40921052631578947, "grad_norm": 0.11064482894831129, "learning_rate": 0.00019919371839637512, "loss": 0.315, "step": 311 }, { "epoch": 0.4105263157894737, "grad_norm": 0.10891430893886998, "learning_rate": 0.00019917419983016025, "loss": 0.3016, "step": 312 }, { "epoch": 0.4118421052631579, "grad_norm": 0.09966222612672655, "learning_rate": 0.0001991544488063972, "loss": 0.3104, "step": 313 }, { "epoch": 0.4131578947368421, "grad_norm": 0.09948289810327957, "learning_rate": 0.00019913446537138106, "loss": 0.3199, "step": 314 }, { "epoch": 0.4144736842105263, "grad_norm": 0.10221948739250226, "learning_rate": 0.00019911424957195158, "loss": 0.3027, "step": 315 }, { "epoch": 0.41578947368421054, "grad_norm": 0.10319916825175317, "learning_rate": 0.00019909380145549324, "loss": 0.2962, "step": 316 }, { "epoch": 0.41710526315789476, "grad_norm": 0.10112207458406539, "learning_rate": 0.00019907312106993503, "loss": 0.3072, "step": 317 }, { "epoch": 0.41842105263157897, "grad_norm": 0.1056216300520592, "learning_rate": 0.00019905220846375032, "loss": 0.3108, "step": 318 }, { "epoch": 0.41973684210526313, "grad_norm": 0.10432503631292016, "learning_rate": 0.0001990310636859569, "loss": 0.3084, "step": 319 }, { "epoch": 0.42105263157894735, "grad_norm": 0.10088521223549587, "learning_rate": 0.00019900968678611666, "loss": 0.3094, "step": 320 }, { "epoch": 0.42236842105263156, "grad_norm": 0.09994245819164545, "learning_rate": 0.00019898807781433555, "loss": 0.3142, "step": 321 }, { "epoch": 0.4236842105263158, "grad_norm": 0.09903460972489679, "learning_rate": 0.00019896623682126355, "loss": 0.307, "step": 322 }, { "epoch": 0.425, "grad_norm": 0.09914885914415125, "learning_rate": 0.00019894416385809444, "loss": 0.3014, "step": 323 }, { "epoch": 0.4263157894736842, "grad_norm": 0.10122296873459234, "learning_rate": 0.00019892185897656578, "loss": 0.303, "step": 324 }, { "epoch": 0.4276315789473684, "grad_norm": 0.09910489155114076, "learning_rate": 0.0001988993222289587, "loss": 0.3117, "step": 325 }, { "epoch": 0.42894736842105263, "grad_norm": 0.09241116281014006, "learning_rate": 0.0001988765536680977, "loss": 0.2934, "step": 326 }, { "epoch": 0.43026315789473685, "grad_norm": 0.09770357586471753, "learning_rate": 0.00019885355334735082, "loss": 0.317, "step": 327 }, { "epoch": 0.43157894736842106, "grad_norm": 0.09697496010174222, "learning_rate": 0.00019883032132062925, "loss": 0.2998, "step": 328 }, { "epoch": 0.4328947368421053, "grad_norm": 0.10013197876212468, "learning_rate": 0.0001988068576423872, "loss": 0.3105, "step": 329 }, { "epoch": 0.4342105263157895, "grad_norm": 0.09453550444418382, "learning_rate": 0.00019878316236762196, "loss": 0.3091, "step": 330 }, { "epoch": 0.4355263157894737, "grad_norm": 0.10361296309003387, "learning_rate": 0.00019875923555187365, "loss": 0.3117, "step": 331 }, { "epoch": 0.4368421052631579, "grad_norm": 0.09867438093722174, "learning_rate": 0.00019873507725122504, "loss": 0.3105, "step": 332 }, { "epoch": 0.4381578947368421, "grad_norm": 0.09914171961719935, "learning_rate": 0.00019871068752230162, "loss": 0.3299, "step": 333 }, { "epoch": 0.4394736842105263, "grad_norm": 0.09508589565593037, "learning_rate": 0.00019868606642227122, "loss": 0.3057, "step": 334 }, { "epoch": 0.4407894736842105, "grad_norm": 0.10553369850294496, "learning_rate": 0.00019866121400884397, "loss": 0.2968, "step": 335 }, { "epoch": 0.4421052631578947, "grad_norm": 0.09693152875657689, "learning_rate": 0.00019863613034027224, "loss": 0.3157, "step": 336 }, { "epoch": 0.44342105263157894, "grad_norm": 0.09914036757559133, "learning_rate": 0.0001986108154753505, "loss": 0.3075, "step": 337 }, { "epoch": 0.44473684210526315, "grad_norm": 0.10323372097858888, "learning_rate": 0.00019858526947341497, "loss": 0.316, "step": 338 }, { "epoch": 0.44605263157894737, "grad_norm": 0.1047670642802198, "learning_rate": 0.0001985594923943438, "loss": 0.3016, "step": 339 }, { "epoch": 0.4473684210526316, "grad_norm": 0.10038200225737869, "learning_rate": 0.00019853348429855672, "loss": 0.313, "step": 340 }, { "epoch": 0.4486842105263158, "grad_norm": 0.09995800002881376, "learning_rate": 0.00019850724524701486, "loss": 0.3201, "step": 341 }, { "epoch": 0.45, "grad_norm": 0.09706393223860003, "learning_rate": 0.00019848077530122083, "loss": 0.302, "step": 342 }, { "epoch": 0.4513157894736842, "grad_norm": 0.10217585842187529, "learning_rate": 0.0001984540745232183, "loss": 0.3052, "step": 343 }, { "epoch": 0.45263157894736844, "grad_norm": 0.09716163731483823, "learning_rate": 0.00019842714297559213, "loss": 0.3182, "step": 344 }, { "epoch": 0.45394736842105265, "grad_norm": 0.09580525044842596, "learning_rate": 0.000198399980721468, "loss": 0.3016, "step": 345 }, { "epoch": 0.45526315789473687, "grad_norm": 0.10216214300781075, "learning_rate": 0.0001983725878245124, "loss": 0.3093, "step": 346 }, { "epoch": 0.45657894736842103, "grad_norm": 0.09748580988527811, "learning_rate": 0.0001983449643489324, "loss": 0.3077, "step": 347 }, { "epoch": 0.45789473684210524, "grad_norm": 0.10212802995697862, "learning_rate": 0.0001983171103594755, "loss": 0.3039, "step": 348 }, { "epoch": 0.45921052631578946, "grad_norm": 0.10018192206504958, "learning_rate": 0.00019828902592142962, "loss": 0.3023, "step": 349 }, { "epoch": 0.4605263157894737, "grad_norm": 0.09425076167869935, "learning_rate": 0.0001982607111006227, "loss": 0.3119, "step": 350 }, { "epoch": 0.4618421052631579, "grad_norm": 0.0967215301525356, "learning_rate": 0.0001982321659634228, "loss": 0.3097, "step": 351 }, { "epoch": 0.4631578947368421, "grad_norm": 0.09622422612500942, "learning_rate": 0.0001982033905767377, "loss": 0.3114, "step": 352 }, { "epoch": 0.4644736842105263, "grad_norm": 0.09360899731223273, "learning_rate": 0.00019817438500801502, "loss": 0.3099, "step": 353 }, { "epoch": 0.46578947368421053, "grad_norm": 0.09987640487171251, "learning_rate": 0.0001981451493252418, "loss": 0.2994, "step": 354 }, { "epoch": 0.46710526315789475, "grad_norm": 0.10065235571453063, "learning_rate": 0.00019811568359694447, "loss": 0.3048, "step": 355 }, { "epoch": 0.46842105263157896, "grad_norm": 0.10208391531367922, "learning_rate": 0.00019808598789218865, "loss": 0.2939, "step": 356 }, { "epoch": 0.4697368421052632, "grad_norm": 0.10402670400636525, "learning_rate": 0.00019805606228057916, "loss": 0.3024, "step": 357 }, { "epoch": 0.4710526315789474, "grad_norm": 0.09930311628562899, "learning_rate": 0.00019802590683225946, "loss": 0.3024, "step": 358 }, { "epoch": 0.4723684210526316, "grad_norm": 0.10262536271155684, "learning_rate": 0.0001979955216179119, "loss": 0.3158, "step": 359 }, { "epoch": 0.47368421052631576, "grad_norm": 0.0943881489890398, "learning_rate": 0.0001979649067087574, "loss": 0.3095, "step": 360 }, { "epoch": 0.475, "grad_norm": 0.09827781548848098, "learning_rate": 0.00019793406217655517, "loss": 0.3084, "step": 361 }, { "epoch": 0.4763157894736842, "grad_norm": 0.09396956618890727, "learning_rate": 0.00019790298809360267, "loss": 0.3108, "step": 362 }, { "epoch": 0.4776315789473684, "grad_norm": 0.0971493552146761, "learning_rate": 0.00019787168453273544, "loss": 0.3234, "step": 363 }, { "epoch": 0.4789473684210526, "grad_norm": 0.09635250868834104, "learning_rate": 0.00019784015156732693, "loss": 0.2961, "step": 364 }, { "epoch": 0.48026315789473684, "grad_norm": 0.0957638945230677, "learning_rate": 0.00019780838927128822, "loss": 0.2999, "step": 365 }, { "epoch": 0.48157894736842105, "grad_norm": 0.09673326599521005, "learning_rate": 0.00019777639771906795, "loss": 0.3065, "step": 366 }, { "epoch": 0.48289473684210527, "grad_norm": 0.10037616383525308, "learning_rate": 0.00019774417698565215, "loss": 0.3155, "step": 367 }, { "epoch": 0.4842105263157895, "grad_norm": 0.09978049327969367, "learning_rate": 0.000197711727146564, "loss": 0.3165, "step": 368 }, { "epoch": 0.4855263157894737, "grad_norm": 0.09791958592311975, "learning_rate": 0.00019767904827786375, "loss": 0.3089, "step": 369 }, { "epoch": 0.4868421052631579, "grad_norm": 0.09995882365072112, "learning_rate": 0.00019764614045614836, "loss": 0.2988, "step": 370 }, { "epoch": 0.4881578947368421, "grad_norm": 0.09232234693055072, "learning_rate": 0.0001976130037585516, "loss": 0.3042, "step": 371 }, { "epoch": 0.48947368421052634, "grad_norm": 0.09643481824906593, "learning_rate": 0.00019757963826274357, "loss": 0.294, "step": 372 }, { "epoch": 0.49078947368421055, "grad_norm": 0.09697913699025175, "learning_rate": 0.00019754604404693073, "loss": 0.3053, "step": 373 }, { "epoch": 0.4921052631578947, "grad_norm": 0.09907221334912318, "learning_rate": 0.00019751222118985563, "loss": 0.2968, "step": 374 }, { "epoch": 0.4934210526315789, "grad_norm": 0.09502914913791907, "learning_rate": 0.00019747816977079671, "loss": 0.3078, "step": 375 }, { "epoch": 0.49473684210526314, "grad_norm": 0.09936470649154033, "learning_rate": 0.00019744388986956822, "loss": 0.3097, "step": 376 }, { "epoch": 0.49605263157894736, "grad_norm": 0.09802911974074544, "learning_rate": 0.0001974093815665199, "loss": 0.2958, "step": 377 }, { "epoch": 0.49736842105263157, "grad_norm": 0.09898755664324496, "learning_rate": 0.0001973746449425368, "loss": 0.3083, "step": 378 }, { "epoch": 0.4986842105263158, "grad_norm": 0.0961406374333812, "learning_rate": 0.00019733968007903922, "loss": 0.2976, "step": 379 }, { "epoch": 0.5, "grad_norm": 0.10159753311874659, "learning_rate": 0.00019730448705798239, "loss": 0.3033, "step": 380 }, { "epoch": 0.5013157894736842, "grad_norm": 0.0947441745895011, "learning_rate": 0.0001972690659618564, "loss": 0.2852, "step": 381 }, { "epoch": 0.5026315789473684, "grad_norm": 0.09775748236249004, "learning_rate": 0.00019723341687368583, "loss": 0.2929, "step": 382 }, { "epoch": 0.5039473684210526, "grad_norm": 0.09935691145327614, "learning_rate": 0.00019719753987702978, "loss": 0.3111, "step": 383 }, { "epoch": 0.5052631578947369, "grad_norm": 0.09893488807438079, "learning_rate": 0.0001971614350559814, "loss": 0.3158, "step": 384 }, { "epoch": 0.506578947368421, "grad_norm": 0.0947497912778192, "learning_rate": 0.00019712510249516793, "loss": 0.2921, "step": 385 }, { "epoch": 0.5078947368421053, "grad_norm": 0.08737508895101483, "learning_rate": 0.00019708854227975048, "loss": 0.2927, "step": 386 }, { "epoch": 0.5092105263157894, "grad_norm": 0.09835779795374115, "learning_rate": 0.00019705175449542358, "loss": 0.2976, "step": 387 }, { "epoch": 0.5105263157894737, "grad_norm": 0.09107621488427446, "learning_rate": 0.0001970147392284154, "loss": 0.2814, "step": 388 }, { "epoch": 0.5118421052631579, "grad_norm": 0.09743589157185283, "learning_rate": 0.00019697749656548714, "loss": 0.3003, "step": 389 }, { "epoch": 0.5131578947368421, "grad_norm": 0.09595683633527828, "learning_rate": 0.00019694002659393305, "loss": 0.3033, "step": 390 }, { "epoch": 0.5144736842105263, "grad_norm": 0.09879879361599342, "learning_rate": 0.0001969023294015802, "loss": 0.3133, "step": 391 }, { "epoch": 0.5157894736842106, "grad_norm": 0.10105822465511066, "learning_rate": 0.00019686440507678824, "loss": 0.3103, "step": 392 }, { "epoch": 0.5171052631578947, "grad_norm": 0.10182072904717256, "learning_rate": 0.00019682625370844918, "loss": 0.2978, "step": 393 }, { "epoch": 0.5184210526315789, "grad_norm": 0.09540364410216295, "learning_rate": 0.00019678787538598725, "loss": 0.3243, "step": 394 }, { "epoch": 0.5197368421052632, "grad_norm": 0.09754493311070281, "learning_rate": 0.00019674927019935857, "loss": 0.2957, "step": 395 }, { "epoch": 0.5210526315789473, "grad_norm": 0.09261668278184784, "learning_rate": 0.0001967104382390511, "loss": 0.2909, "step": 396 }, { "epoch": 0.5223684210526316, "grad_norm": 0.0978202914794332, "learning_rate": 0.00019667137959608426, "loss": 0.2875, "step": 397 }, { "epoch": 0.5236842105263158, "grad_norm": 0.09087851403559502, "learning_rate": 0.00019663209436200887, "loss": 0.2959, "step": 398 }, { "epoch": 0.525, "grad_norm": 0.09362515028658314, "learning_rate": 0.00019659258262890683, "loss": 0.3042, "step": 399 }, { "epoch": 0.5263157894736842, "grad_norm": 0.09012777438494002, "learning_rate": 0.00019655284448939094, "loss": 0.2959, "step": 400 }, { "epoch": 0.5276315789473685, "grad_norm": 0.0960827279363291, "learning_rate": 0.0001965128800366047, "loss": 0.2929, "step": 401 }, { "epoch": 0.5289473684210526, "grad_norm": 0.09300199527426295, "learning_rate": 0.00019647268936422206, "loss": 0.283, "step": 402 }, { "epoch": 0.5302631578947369, "grad_norm": 0.09683305560772038, "learning_rate": 0.00019643227256644716, "loss": 0.3105, "step": 403 }, { "epoch": 0.531578947368421, "grad_norm": 0.0954490202804032, "learning_rate": 0.00019639162973801426, "loss": 0.3023, "step": 404 }, { "epoch": 0.5328947368421053, "grad_norm": 0.09612701147402002, "learning_rate": 0.00019635076097418734, "loss": 0.3176, "step": 405 }, { "epoch": 0.5342105263157895, "grad_norm": 0.09104963315940301, "learning_rate": 0.00019630966637076004, "loss": 0.2997, "step": 406 }, { "epoch": 0.5355263157894737, "grad_norm": 0.0986012989783705, "learning_rate": 0.00019626834602405523, "loss": 0.2964, "step": 407 }, { "epoch": 0.5368421052631579, "grad_norm": 0.10336828609236494, "learning_rate": 0.00019622680003092503, "loss": 0.2886, "step": 408 }, { "epoch": 0.5381578947368421, "grad_norm": 0.09141302481564861, "learning_rate": 0.00019618502848875045, "loss": 0.2898, "step": 409 }, { "epoch": 0.5394736842105263, "grad_norm": 0.09811357001705759, "learning_rate": 0.00019614303149544102, "loss": 0.312, "step": 410 }, { "epoch": 0.5407894736842105, "grad_norm": 0.09760884292596834, "learning_rate": 0.00019610080914943492, "loss": 0.31, "step": 411 }, { "epoch": 0.5421052631578948, "grad_norm": 0.09299566319312146, "learning_rate": 0.0001960583615496984, "loss": 0.2892, "step": 412 }, { "epoch": 0.5434210526315789, "grad_norm": 0.09898383275023284, "learning_rate": 0.0001960156887957257, "loss": 0.3005, "step": 413 }, { "epoch": 0.5447368421052632, "grad_norm": 0.10036363231456787, "learning_rate": 0.0001959727909875389, "loss": 0.2882, "step": 414 }, { "epoch": 0.5460526315789473, "grad_norm": 0.08979092246398529, "learning_rate": 0.00019592966822568753, "loss": 0.301, "step": 415 }, { "epoch": 0.5473684210526316, "grad_norm": 0.09422502905792629, "learning_rate": 0.00019588632061124837, "loss": 0.2851, "step": 416 }, { "epoch": 0.5486842105263158, "grad_norm": 0.08769883690329003, "learning_rate": 0.0001958427482458253, "loss": 0.2874, "step": 417 }, { "epoch": 0.55, "grad_norm": 0.10126724866744835, "learning_rate": 0.0001957989512315489, "loss": 0.3068, "step": 418 }, { "epoch": 0.5513157894736842, "grad_norm": 0.09755145401929033, "learning_rate": 0.00019575492967107642, "loss": 0.3072, "step": 419 }, { "epoch": 0.5526315789473685, "grad_norm": 0.09089099824025165, "learning_rate": 0.00019571068366759143, "loss": 0.2891, "step": 420 }, { "epoch": 0.5539473684210526, "grad_norm": 0.0961002056223822, "learning_rate": 0.00019566621332480348, "loss": 0.2935, "step": 421 }, { "epoch": 0.5552631578947368, "grad_norm": 0.09426174969645366, "learning_rate": 0.00019562151874694803, "loss": 0.2925, "step": 422 }, { "epoch": 0.5565789473684211, "grad_norm": 0.09928190504185619, "learning_rate": 0.00019557660003878614, "loss": 0.3055, "step": 423 }, { "epoch": 0.5578947368421052, "grad_norm": 0.09531727011576784, "learning_rate": 0.00019553145730560415, "loss": 0.2916, "step": 424 }, { "epoch": 0.5592105263157895, "grad_norm": 0.09655059432651882, "learning_rate": 0.00019548609065321356, "loss": 0.3058, "step": 425 }, { "epoch": 0.5605263157894737, "grad_norm": 0.09707096390323401, "learning_rate": 0.00019544050018795075, "loss": 0.2873, "step": 426 }, { "epoch": 0.5618421052631579, "grad_norm": 0.09428953984002672, "learning_rate": 0.00019539468601667662, "loss": 0.298, "step": 427 }, { "epoch": 0.5631578947368421, "grad_norm": 0.09658263944025584, "learning_rate": 0.0001953486482467764, "loss": 0.3004, "step": 428 }, { "epoch": 0.5644736842105263, "grad_norm": 0.10052098718968268, "learning_rate": 0.00019530238698615957, "loss": 0.2975, "step": 429 }, { "epoch": 0.5657894736842105, "grad_norm": 0.09350068437861885, "learning_rate": 0.00019525590234325933, "loss": 0.2977, "step": 430 }, { "epoch": 0.5671052631578948, "grad_norm": 0.09265833701591007, "learning_rate": 0.00019520919442703245, "loss": 0.2941, "step": 431 }, { "epoch": 0.5684210526315789, "grad_norm": 0.0914948295571144, "learning_rate": 0.0001951622633469592, "loss": 0.3032, "step": 432 }, { "epoch": 0.5697368421052632, "grad_norm": 0.09538108350867988, "learning_rate": 0.00019511510921304273, "loss": 0.3057, "step": 433 }, { "epoch": 0.5710526315789474, "grad_norm": 0.09388517597881102, "learning_rate": 0.00019506773213580917, "loss": 0.2974, "step": 434 }, { "epoch": 0.5723684210526315, "grad_norm": 0.09853399504304618, "learning_rate": 0.00019502013222630712, "loss": 0.2831, "step": 435 }, { "epoch": 0.5736842105263158, "grad_norm": 0.09228736628869874, "learning_rate": 0.00019497230959610756, "loss": 0.2885, "step": 436 }, { "epoch": 0.575, "grad_norm": 0.09786678573184258, "learning_rate": 0.0001949242643573034, "loss": 0.3082, "step": 437 }, { "epoch": 0.5763157894736842, "grad_norm": 0.09190204992934663, "learning_rate": 0.00019487599662250943, "loss": 0.2974, "step": 438 }, { "epoch": 0.5776315789473684, "grad_norm": 0.09838875478146845, "learning_rate": 0.00019482750650486193, "loss": 0.2869, "step": 439 }, { "epoch": 0.5789473684210527, "grad_norm": 0.09080464891287887, "learning_rate": 0.00019477879411801844, "loss": 0.2999, "step": 440 }, { "epoch": 0.5802631578947368, "grad_norm": 0.094818410464559, "learning_rate": 0.0001947298595761574, "loss": 0.2948, "step": 441 }, { "epoch": 0.5815789473684211, "grad_norm": 0.09229949003285352, "learning_rate": 0.00019468070299397808, "loss": 0.2851, "step": 442 }, { "epoch": 0.5828947368421052, "grad_norm": 0.0946200275878873, "learning_rate": 0.0001946313244867002, "loss": 0.3035, "step": 443 }, { "epoch": 0.5842105263157895, "grad_norm": 0.0972452681474162, "learning_rate": 0.00019458172417006347, "loss": 0.2921, "step": 444 }, { "epoch": 0.5855263157894737, "grad_norm": 0.09168097969668783, "learning_rate": 0.00019453190216032776, "loss": 0.2965, "step": 445 }, { "epoch": 0.5868421052631579, "grad_norm": 0.09238240620410484, "learning_rate": 0.00019448185857427242, "loss": 0.2965, "step": 446 }, { "epoch": 0.5881578947368421, "grad_norm": 0.10748289837029415, "learning_rate": 0.00019443159352919623, "loss": 0.2912, "step": 447 }, { "epoch": 0.5894736842105263, "grad_norm": 0.09871636341616816, "learning_rate": 0.00019438110714291694, "loss": 0.3111, "step": 448 }, { "epoch": 0.5907894736842105, "grad_norm": 0.08861315777205786, "learning_rate": 0.00019433039953377127, "loss": 0.2891, "step": 449 }, { "epoch": 0.5921052631578947, "grad_norm": 0.09570467370902319, "learning_rate": 0.00019427947082061432, "loss": 0.3046, "step": 450 }, { "epoch": 0.593421052631579, "grad_norm": 0.0882411637366609, "learning_rate": 0.00019422832112281962, "loss": 0.2948, "step": 451 }, { "epoch": 0.5947368421052631, "grad_norm": 0.08681692074605786, "learning_rate": 0.00019417695056027844, "loss": 0.2975, "step": 452 }, { "epoch": 0.5960526315789474, "grad_norm": 0.0894965926644007, "learning_rate": 0.00019412535925339997, "loss": 0.2982, "step": 453 }, { "epoch": 0.5973684210526315, "grad_norm": 0.08949478378197007, "learning_rate": 0.00019407354732311064, "loss": 0.2923, "step": 454 }, { "epoch": 0.5986842105263158, "grad_norm": 0.0952879006180906, "learning_rate": 0.0001940215148908541, "loss": 0.2959, "step": 455 }, { "epoch": 0.6, "grad_norm": 0.09341215938159328, "learning_rate": 0.00019396926207859084, "loss": 0.2987, "step": 456 }, { "epoch": 0.6013157894736842, "grad_norm": 0.08989810174624617, "learning_rate": 0.00019391678900879786, "loss": 0.2991, "step": 457 }, { "epoch": 0.6026315789473684, "grad_norm": 0.09052527717801154, "learning_rate": 0.00019386409580446844, "loss": 0.2841, "step": 458 }, { "epoch": 0.6039473684210527, "grad_norm": 0.09246686105449765, "learning_rate": 0.00019381118258911186, "loss": 0.2998, "step": 459 }, { "epoch": 0.6052631578947368, "grad_norm": 0.09632503586933118, "learning_rate": 0.00019375804948675306, "loss": 0.3075, "step": 460 }, { "epoch": 0.6065789473684211, "grad_norm": 0.09476157150253976, "learning_rate": 0.00019370469662193248, "loss": 0.2981, "step": 461 }, { "epoch": 0.6078947368421053, "grad_norm": 0.08742707617370972, "learning_rate": 0.0001936511241197055, "loss": 0.2805, "step": 462 }, { "epoch": 0.6092105263157894, "grad_norm": 0.09283003181432853, "learning_rate": 0.00019359733210564244, "loss": 0.2955, "step": 463 }, { "epoch": 0.6105263157894737, "grad_norm": 0.09458543714491262, "learning_rate": 0.0001935433207058281, "loss": 0.2949, "step": 464 }, { "epoch": 0.6118421052631579, "grad_norm": 0.08784515458042345, "learning_rate": 0.00019348909004686152, "loss": 0.3033, "step": 465 }, { "epoch": 0.6131578947368421, "grad_norm": 0.08813027640751597, "learning_rate": 0.00019343464025585563, "loss": 0.2949, "step": 466 }, { "epoch": 0.6144736842105263, "grad_norm": 0.09671795564332149, "learning_rate": 0.00019337997146043708, "loss": 0.3022, "step": 467 }, { "epoch": 0.6157894736842106, "grad_norm": 0.09555526827290024, "learning_rate": 0.0001933250837887457, "loss": 0.2956, "step": 468 }, { "epoch": 0.6171052631578947, "grad_norm": 0.08589539494059092, "learning_rate": 0.00019326997736943455, "loss": 0.3052, "step": 469 }, { "epoch": 0.618421052631579, "grad_norm": 0.08999975458012179, "learning_rate": 0.00019321465233166924, "loss": 0.2949, "step": 470 }, { "epoch": 0.6197368421052631, "grad_norm": 0.09506140617593234, "learning_rate": 0.0001931591088051279, "loss": 0.3151, "step": 471 }, { "epoch": 0.6210526315789474, "grad_norm": 0.09611958378062155, "learning_rate": 0.00019310334692000075, "loss": 0.2907, "step": 472 }, { "epoch": 0.6223684210526316, "grad_norm": 0.09078308388388881, "learning_rate": 0.00019304736680698988, "loss": 0.2969, "step": 473 }, { "epoch": 0.6236842105263158, "grad_norm": 0.09080240341624422, "learning_rate": 0.0001929911685973088, "loss": 0.2969, "step": 474 }, { "epoch": 0.625, "grad_norm": 0.08939149414674773, "learning_rate": 0.00019293475242268223, "loss": 0.2849, "step": 475 }, { "epoch": 0.6263157894736842, "grad_norm": 0.08769153124531658, "learning_rate": 0.00019287811841534595, "loss": 0.2857, "step": 476 }, { "epoch": 0.6276315789473684, "grad_norm": 0.09345365417023815, "learning_rate": 0.00019282126670804614, "loss": 0.3044, "step": 477 }, { "epoch": 0.6289473684210526, "grad_norm": 0.09137801449382149, "learning_rate": 0.00019276419743403933, "loss": 0.2993, "step": 478 }, { "epoch": 0.6302631578947369, "grad_norm": 0.090592310213129, "learning_rate": 0.00019270691072709195, "loss": 0.3005, "step": 479 }, { "epoch": 0.631578947368421, "grad_norm": 0.09174534803218672, "learning_rate": 0.00019264940672148018, "loss": 0.3059, "step": 480 }, { "epoch": 0.6328947368421053, "grad_norm": 0.0871835682901987, "learning_rate": 0.00019259168555198948, "loss": 0.286, "step": 481 }, { "epoch": 0.6342105263157894, "grad_norm": 0.0892410180390742, "learning_rate": 0.0001925337473539143, "loss": 0.2788, "step": 482 }, { "epoch": 0.6355263157894737, "grad_norm": 0.09248088501840468, "learning_rate": 0.00019247559226305785, "loss": 0.2948, "step": 483 }, { "epoch": 0.6368421052631579, "grad_norm": 0.09098010321941165, "learning_rate": 0.00019241722041573166, "loss": 0.3028, "step": 484 }, { "epoch": 0.6381578947368421, "grad_norm": 0.0896570234030572, "learning_rate": 0.00019235863194875532, "loss": 0.2832, "step": 485 }, { "epoch": 0.6394736842105263, "grad_norm": 0.08779926165652006, "learning_rate": 0.0001922998269994563, "loss": 0.2987, "step": 486 }, { "epoch": 0.6407894736842106, "grad_norm": 0.09365677051236368, "learning_rate": 0.00019224080570566927, "loss": 0.2912, "step": 487 }, { "epoch": 0.6421052631578947, "grad_norm": 0.08927856303791634, "learning_rate": 0.0001921815682057362, "loss": 0.2966, "step": 488 }, { "epoch": 0.6434210526315789, "grad_norm": 0.0879241771326983, "learning_rate": 0.00019212211463850567, "loss": 0.284, "step": 489 }, { "epoch": 0.6447368421052632, "grad_norm": 0.0878734974007894, "learning_rate": 0.00019206244514333282, "loss": 0.2834, "step": 490 }, { "epoch": 0.6460526315789473, "grad_norm": 0.08899613109416632, "learning_rate": 0.00019200255986007885, "loss": 0.2905, "step": 491 }, { "epoch": 0.6473684210526316, "grad_norm": 0.0868302957136876, "learning_rate": 0.0001919424589291108, "loss": 0.2994, "step": 492 }, { "epoch": 0.6486842105263158, "grad_norm": 0.08873110421371189, "learning_rate": 0.0001918821424913011, "loss": 0.3054, "step": 493 }, { "epoch": 0.65, "grad_norm": 0.08729340226342107, "learning_rate": 0.00019182161068802741, "loss": 0.2926, "step": 494 }, { "epoch": 0.6513157894736842, "grad_norm": 0.09075104025668673, "learning_rate": 0.00019176086366117211, "loss": 0.304, "step": 495 }, { "epoch": 0.6526315789473685, "grad_norm": 0.0893794254125347, "learning_rate": 0.0001916999015531221, "loss": 0.2917, "step": 496 }, { "epoch": 0.6539473684210526, "grad_norm": 0.089792074117038, "learning_rate": 0.00019163872450676835, "loss": 0.2812, "step": 497 }, { "epoch": 0.6552631578947369, "grad_norm": 0.09280308668273284, "learning_rate": 0.00019157733266550575, "loss": 0.3024, "step": 498 }, { "epoch": 0.656578947368421, "grad_norm": 0.0871128187479411, "learning_rate": 0.00019151572617323253, "loss": 0.29, "step": 499 }, { "epoch": 0.6578947368421053, "grad_norm": 0.08870708451193264, "learning_rate": 0.00019145390517435012, "loss": 0.3034, "step": 500 }, { "epoch": 0.6592105263157895, "grad_norm": 0.08720360293850979, "learning_rate": 0.00019139186981376267, "loss": 0.2844, "step": 501 }, { "epoch": 0.6605263157894737, "grad_norm": 0.08832729646265089, "learning_rate": 0.0001913296202368769, "loss": 0.2999, "step": 502 }, { "epoch": 0.6618421052631579, "grad_norm": 0.08817831223979289, "learning_rate": 0.0001912671565896015, "loss": 0.3034, "step": 503 }, { "epoch": 0.6631578947368421, "grad_norm": 0.08919226453530668, "learning_rate": 0.00019120447901834706, "loss": 0.2865, "step": 504 }, { "epoch": 0.6644736842105263, "grad_norm": 0.09256732471786443, "learning_rate": 0.00019114158767002547, "loss": 0.2887, "step": 505 }, { "epoch": 0.6657894736842105, "grad_norm": 0.08807553839373092, "learning_rate": 0.00019107848269204976, "loss": 0.2864, "step": 506 }, { "epoch": 0.6671052631578948, "grad_norm": 0.08590339271381157, "learning_rate": 0.00019101516423233368, "loss": 0.2878, "step": 507 }, { "epoch": 0.6684210526315789, "grad_norm": 0.08651840397545302, "learning_rate": 0.00019095163243929142, "loss": 0.3005, "step": 508 }, { "epoch": 0.6697368421052632, "grad_norm": 0.08955034533153189, "learning_rate": 0.00019088788746183714, "loss": 0.2906, "step": 509 }, { "epoch": 0.6710526315789473, "grad_norm": 0.08824857512769824, "learning_rate": 0.00019082392944938466, "loss": 0.2826, "step": 510 }, { "epoch": 0.6723684210526316, "grad_norm": 0.08981332868291347, "learning_rate": 0.00019075975855184724, "loss": 0.2791, "step": 511 }, { "epoch": 0.6736842105263158, "grad_norm": 0.08872229343602653, "learning_rate": 0.0001906953749196371, "loss": 0.2941, "step": 512 }, { "epoch": 0.675, "grad_norm": 0.08604148465349523, "learning_rate": 0.000190630778703665, "loss": 0.2898, "step": 513 }, { "epoch": 0.6763157894736842, "grad_norm": 0.09068748827778476, "learning_rate": 0.00019056597005534013, "loss": 0.3052, "step": 514 }, { "epoch": 0.6776315789473685, "grad_norm": 0.09179119755745117, "learning_rate": 0.00019050094912656952, "loss": 0.3034, "step": 515 }, { "epoch": 0.6789473684210526, "grad_norm": 0.08427846310926745, "learning_rate": 0.00019043571606975777, "loss": 0.2948, "step": 516 }, { "epoch": 0.6802631578947368, "grad_norm": 0.08476965472606288, "learning_rate": 0.00019037027103780668, "loss": 0.2883, "step": 517 }, { "epoch": 0.6815789473684211, "grad_norm": 0.08421580836098888, "learning_rate": 0.00019030461418411497, "loss": 0.2775, "step": 518 }, { "epoch": 0.6828947368421052, "grad_norm": 0.08618030098765003, "learning_rate": 0.00019023874566257784, "loss": 0.2771, "step": 519 }, { "epoch": 0.6842105263157895, "grad_norm": 0.08571261541013994, "learning_rate": 0.00019017266562758659, "loss": 0.289, "step": 520 }, { "epoch": 0.6855263157894737, "grad_norm": 0.09462804464406739, "learning_rate": 0.00019010637423402823, "loss": 0.31, "step": 521 }, { "epoch": 0.6868421052631579, "grad_norm": 0.09220006978220673, "learning_rate": 0.00019003987163728535, "loss": 0.2898, "step": 522 }, { "epoch": 0.6881578947368421, "grad_norm": 0.08687665486527996, "learning_rate": 0.00018997315799323548, "loss": 0.2914, "step": 523 }, { "epoch": 0.6894736842105263, "grad_norm": 0.08629627380926978, "learning_rate": 0.00018990623345825083, "loss": 0.2875, "step": 524 }, { "epoch": 0.6907894736842105, "grad_norm": 0.08990734481501246, "learning_rate": 0.0001898390981891979, "loss": 0.2935, "step": 525 }, { "epoch": 0.6921052631578948, "grad_norm": 0.09107845137785629, "learning_rate": 0.00018977175234343723, "loss": 0.293, "step": 526 }, { "epoch": 0.6934210526315789, "grad_norm": 0.08746893451432518, "learning_rate": 0.00018970419607882284, "loss": 0.2834, "step": 527 }, { "epoch": 0.6947368421052632, "grad_norm": 0.08510400660112896, "learning_rate": 0.00018963642955370201, "loss": 0.2836, "step": 528 }, { "epoch": 0.6960526315789474, "grad_norm": 0.08902379266326126, "learning_rate": 0.00018956845292691487, "loss": 0.2918, "step": 529 }, { "epoch": 0.6973684210526315, "grad_norm": 0.08716702459764675, "learning_rate": 0.00018950026635779397, "loss": 0.3018, "step": 530 }, { "epoch": 0.6986842105263158, "grad_norm": 0.0860349723743795, "learning_rate": 0.00018943187000616395, "loss": 0.2862, "step": 531 }, { "epoch": 0.7, "grad_norm": 0.08640233925578623, "learning_rate": 0.00018936326403234125, "loss": 0.281, "step": 532 }, { "epoch": 0.7013157894736842, "grad_norm": 0.08958858699111341, "learning_rate": 0.0001892944485971335, "loss": 0.2835, "step": 533 }, { "epoch": 0.7026315789473684, "grad_norm": 0.09155021097028584, "learning_rate": 0.0001892254238618394, "loss": 0.2871, "step": 534 }, { "epoch": 0.7039473684210527, "grad_norm": 0.08704346353009886, "learning_rate": 0.00018915618998824825, "loss": 0.2773, "step": 535 }, { "epoch": 0.7052631578947368, "grad_norm": 0.09429002348494249, "learning_rate": 0.00018908674713863952, "loss": 0.2995, "step": 536 }, { "epoch": 0.7065789473684211, "grad_norm": 0.0872284448725193, "learning_rate": 0.00018901709547578245, "loss": 0.29, "step": 537 }, { "epoch": 0.7078947368421052, "grad_norm": 0.08323277604666501, "learning_rate": 0.00018894723516293583, "loss": 0.2924, "step": 538 }, { "epoch": 0.7092105263157895, "grad_norm": 0.09209955844526282, "learning_rate": 0.00018887716636384745, "loss": 0.3093, "step": 539 }, { "epoch": 0.7105263157894737, "grad_norm": 0.0928458401326009, "learning_rate": 0.00018880688924275378, "loss": 0.2898, "step": 540 }, { "epoch": 0.7118421052631579, "grad_norm": 0.08447656093725353, "learning_rate": 0.00018873640396437958, "loss": 0.2998, "step": 541 }, { "epoch": 0.7131578947368421, "grad_norm": 0.08914403154111118, "learning_rate": 0.00018866571069393753, "loss": 0.2975, "step": 542 }, { "epoch": 0.7144736842105263, "grad_norm": 0.08061793607147465, "learning_rate": 0.0001885948095971278, "loss": 0.2693, "step": 543 }, { "epoch": 0.7157894736842105, "grad_norm": 0.08081926207565744, "learning_rate": 0.0001885237008401378, "loss": 0.2833, "step": 544 }, { "epoch": 0.7171052631578947, "grad_norm": 0.08814474168945655, "learning_rate": 0.00018845238458964155, "loss": 0.2867, "step": 545 }, { "epoch": 0.718421052631579, "grad_norm": 0.08581179650874697, "learning_rate": 0.00018838086101279945, "loss": 0.2966, "step": 546 }, { "epoch": 0.7197368421052631, "grad_norm": 0.08824691850575504, "learning_rate": 0.0001883091302772579, "loss": 0.2855, "step": 547 }, { "epoch": 0.7210526315789474, "grad_norm": 0.08188688143306905, "learning_rate": 0.0001882371925511488, "loss": 0.2913, "step": 548 }, { "epoch": 0.7223684210526315, "grad_norm": 0.08584104136037612, "learning_rate": 0.00018816504800308934, "loss": 0.2907, "step": 549 }, { "epoch": 0.7236842105263158, "grad_norm": 0.08410484967989623, "learning_rate": 0.00018809269680218136, "loss": 0.2745, "step": 550 }, { "epoch": 0.725, "grad_norm": 0.08587157095386123, "learning_rate": 0.00018802013911801112, "loss": 0.2881, "step": 551 }, { "epoch": 0.7263157894736842, "grad_norm": 0.09288984345534493, "learning_rate": 0.0001879473751206489, "loss": 0.3067, "step": 552 }, { "epoch": 0.7276315789473684, "grad_norm": 0.08697284865649188, "learning_rate": 0.00018787440498064856, "loss": 0.2919, "step": 553 }, { "epoch": 0.7289473684210527, "grad_norm": 0.08793299552682735, "learning_rate": 0.00018780122886904709, "loss": 0.3029, "step": 554 }, { "epoch": 0.7302631578947368, "grad_norm": 0.08461397738242368, "learning_rate": 0.0001877278469573643, "loss": 0.2977, "step": 555 }, { "epoch": 0.7315789473684211, "grad_norm": 0.0874714643672974, "learning_rate": 0.00018765425941760238, "loss": 0.2928, "step": 556 }, { "epoch": 0.7328947368421053, "grad_norm": 0.0873090646029582, "learning_rate": 0.0001875804664222455, "loss": 0.3066, "step": 557 }, { "epoch": 0.7342105263157894, "grad_norm": 0.08379654737914938, "learning_rate": 0.00018750646814425938, "loss": 0.2977, "step": 558 }, { "epoch": 0.7355263157894737, "grad_norm": 0.09314788862619253, "learning_rate": 0.00018743226475709094, "loss": 0.2963, "step": 559 }, { "epoch": 0.7368421052631579, "grad_norm": 0.08588334887643348, "learning_rate": 0.00018735785643466784, "loss": 0.2878, "step": 560 }, { "epoch": 0.7381578947368421, "grad_norm": 0.08163329639613695, "learning_rate": 0.00018728324335139814, "loss": 0.2991, "step": 561 }, { "epoch": 0.7394736842105263, "grad_norm": 0.08439090485689141, "learning_rate": 0.00018720842568216978, "loss": 0.274, "step": 562 }, { "epoch": 0.7407894736842106, "grad_norm": 0.09031678503562862, "learning_rate": 0.0001871334036023503, "loss": 0.2942, "step": 563 }, { "epoch": 0.7421052631578947, "grad_norm": 0.0812973217567514, "learning_rate": 0.00018705817728778624, "loss": 0.2725, "step": 564 }, { "epoch": 0.743421052631579, "grad_norm": 0.08201157379247533, "learning_rate": 0.00018698274691480302, "loss": 0.2712, "step": 565 }, { "epoch": 0.7447368421052631, "grad_norm": 0.08733882187748677, "learning_rate": 0.00018690711266020426, "loss": 0.2841, "step": 566 }, { "epoch": 0.7460526315789474, "grad_norm": 0.08741424395823033, "learning_rate": 0.0001868312747012715, "loss": 0.2763, "step": 567 }, { "epoch": 0.7473684210526316, "grad_norm": 0.08660511543001677, "learning_rate": 0.00018675523321576371, "loss": 0.2847, "step": 568 }, { "epoch": 0.7486842105263158, "grad_norm": 0.08668938841950703, "learning_rate": 0.00018667898838191694, "loss": 0.2821, "step": 569 }, { "epoch": 0.75, "grad_norm": 0.08397772374383645, "learning_rate": 0.00018660254037844388, "loss": 0.2919, "step": 570 }, { "epoch": 0.7513157894736842, "grad_norm": 0.0890009207591882, "learning_rate": 0.0001865258893845334, "loss": 0.2935, "step": 571 }, { "epoch": 0.7526315789473684, "grad_norm": 0.08615071748725678, "learning_rate": 0.00018644903557985025, "loss": 0.2862, "step": 572 }, { "epoch": 0.7539473684210526, "grad_norm": 0.08035895214007202, "learning_rate": 0.00018637197914453445, "loss": 0.2915, "step": 573 }, { "epoch": 0.7552631578947369, "grad_norm": 0.08394144603894776, "learning_rate": 0.000186294720259201, "loss": 0.2905, "step": 574 }, { "epoch": 0.756578947368421, "grad_norm": 0.08681582101826608, "learning_rate": 0.0001862172591049395, "loss": 0.2927, "step": 575 }, { "epoch": 0.7578947368421053, "grad_norm": 0.08027258031476615, "learning_rate": 0.00018613959586331362, "loss": 0.2717, "step": 576 }, { "epoch": 0.7592105263157894, "grad_norm": 0.0832571328383153, "learning_rate": 0.0001860617307163606, "loss": 0.2877, "step": 577 }, { "epoch": 0.7605263157894737, "grad_norm": 0.0855055635944409, "learning_rate": 0.0001859836638465911, "loss": 0.2879, "step": 578 }, { "epoch": 0.7618421052631579, "grad_norm": 0.08455171659738935, "learning_rate": 0.00018590539543698854, "loss": 0.2946, "step": 579 }, { "epoch": 0.7631578947368421, "grad_norm": 0.08740787324026139, "learning_rate": 0.00018582692567100867, "loss": 0.2835, "step": 580 }, { "epoch": 0.7644736842105263, "grad_norm": 0.08908515189955798, "learning_rate": 0.00018574825473257925, "loss": 0.2917, "step": 581 }, { "epoch": 0.7657894736842106, "grad_norm": 0.0856923697970573, "learning_rate": 0.00018566938280609966, "loss": 0.2794, "step": 582 }, { "epoch": 0.7671052631578947, "grad_norm": 0.08400580488406503, "learning_rate": 0.00018559031007644024, "loss": 0.2764, "step": 583 }, { "epoch": 0.7684210526315789, "grad_norm": 0.08599213131358657, "learning_rate": 0.00018551103672894206, "loss": 0.2902, "step": 584 }, { "epoch": 0.7697368421052632, "grad_norm": 0.08612768497888425, "learning_rate": 0.0001854315629494165, "loss": 0.2913, "step": 585 }, { "epoch": 0.7710526315789473, "grad_norm": 0.08848005818883788, "learning_rate": 0.0001853518889241446, "loss": 0.2913, "step": 586 }, { "epoch": 0.7723684210526316, "grad_norm": 0.08443059614838172, "learning_rate": 0.0001852720148398769, "loss": 0.283, "step": 587 }, { "epoch": 0.7736842105263158, "grad_norm": 0.08864582533090447, "learning_rate": 0.00018519194088383273, "loss": 0.2888, "step": 588 }, { "epoch": 0.775, "grad_norm": 0.08840947193889603, "learning_rate": 0.00018511166724369997, "loss": 0.2931, "step": 589 }, { "epoch": 0.7763157894736842, "grad_norm": 0.08363366619678043, "learning_rate": 0.0001850311941076346, "loss": 0.2782, "step": 590 }, { "epoch": 0.7776315789473685, "grad_norm": 0.08538148462925309, "learning_rate": 0.00018495052166426015, "loss": 0.2856, "step": 591 }, { "epoch": 0.7789473684210526, "grad_norm": 0.08343535715617335, "learning_rate": 0.00018486965010266725, "loss": 0.2847, "step": 592 }, { "epoch": 0.7802631578947369, "grad_norm": 0.08230524033132103, "learning_rate": 0.00018478857961241337, "loss": 0.2878, "step": 593 }, { "epoch": 0.781578947368421, "grad_norm": 0.08479401897743853, "learning_rate": 0.0001847073103835222, "loss": 0.2873, "step": 594 }, { "epoch": 0.7828947368421053, "grad_norm": 0.08524719297922047, "learning_rate": 0.00018462584260648323, "loss": 0.2845, "step": 595 }, { "epoch": 0.7842105263157895, "grad_norm": 0.08448036993692473, "learning_rate": 0.0001845441764722514, "loss": 0.288, "step": 596 }, { "epoch": 0.7855263157894737, "grad_norm": 0.08740687637602888, "learning_rate": 0.0001844623121722465, "loss": 0.2753, "step": 597 }, { "epoch": 0.7868421052631579, "grad_norm": 0.08303399012001453, "learning_rate": 0.0001843802498983529, "loss": 0.2875, "step": 598 }, { "epoch": 0.7881578947368421, "grad_norm": 0.08523557992775858, "learning_rate": 0.00018429798984291896, "loss": 0.2843, "step": 599 }, { "epoch": 0.7894736842105263, "grad_norm": 0.0882338916613198, "learning_rate": 0.00018421553219875658, "loss": 0.3044, "step": 600 }, { "epoch": 0.7907894736842105, "grad_norm": 0.08386004069859475, "learning_rate": 0.00018413287715914089, "loss": 0.2744, "step": 601 }, { "epoch": 0.7921052631578948, "grad_norm": 0.0860906320981426, "learning_rate": 0.00018405002491780968, "loss": 0.2904, "step": 602 }, { "epoch": 0.7934210526315789, "grad_norm": 0.08412547381525294, "learning_rate": 0.00018396697566896286, "loss": 0.2944, "step": 603 }, { "epoch": 0.7947368421052632, "grad_norm": 0.08160487708739002, "learning_rate": 0.00018388372960726228, "loss": 0.2894, "step": 604 }, { "epoch": 0.7960526315789473, "grad_norm": 0.07789675625414566, "learning_rate": 0.00018380028692783096, "loss": 0.289, "step": 605 }, { "epoch": 0.7973684210526316, "grad_norm": 0.08326388068355157, "learning_rate": 0.00018371664782625287, "loss": 0.2757, "step": 606 }, { "epoch": 0.7986842105263158, "grad_norm": 0.0813892959155161, "learning_rate": 0.00018363281249857233, "loss": 0.2805, "step": 607 }, { "epoch": 0.8, "grad_norm": 0.08338896150212435, "learning_rate": 0.00018354878114129367, "loss": 0.2761, "step": 608 }, { "epoch": 0.8013157894736842, "grad_norm": 0.08472324714205545, "learning_rate": 0.00018346455395138058, "loss": 0.2814, "step": 609 }, { "epoch": 0.8026315789473685, "grad_norm": 0.08513461063729397, "learning_rate": 0.00018338013112625587, "loss": 0.2952, "step": 610 }, { "epoch": 0.8039473684210526, "grad_norm": 0.08100863304109138, "learning_rate": 0.00018329551286380087, "loss": 0.2756, "step": 611 }, { "epoch": 0.8052631578947368, "grad_norm": 0.0817298921819726, "learning_rate": 0.00018321069936235503, "loss": 0.2788, "step": 612 }, { "epoch": 0.8065789473684211, "grad_norm": 0.08139556434473312, "learning_rate": 0.00018312569082071535, "loss": 0.2855, "step": 613 }, { "epoch": 0.8078947368421052, "grad_norm": 0.08426481750684653, "learning_rate": 0.0001830404874381361, "loss": 0.2932, "step": 614 }, { "epoch": 0.8092105263157895, "grad_norm": 0.08338650300983878, "learning_rate": 0.00018295508941432815, "loss": 0.291, "step": 615 }, { "epoch": 0.8105263157894737, "grad_norm": 0.0841355163206272, "learning_rate": 0.00018286949694945866, "loss": 0.2905, "step": 616 }, { "epoch": 0.8118421052631579, "grad_norm": 0.08411824810347067, "learning_rate": 0.0001827837102441505, "loss": 0.2992, "step": 617 }, { "epoch": 0.8131578947368421, "grad_norm": 0.08090022262513084, "learning_rate": 0.00018269772949948182, "loss": 0.2906, "step": 618 }, { "epoch": 0.8144736842105263, "grad_norm": 0.0788680440518934, "learning_rate": 0.00018261155491698568, "loss": 0.2939, "step": 619 }, { "epoch": 0.8157894736842105, "grad_norm": 0.07833398942854092, "learning_rate": 0.00018252518669864936, "loss": 0.2873, "step": 620 }, { "epoch": 0.8171052631578948, "grad_norm": 0.08646064241825135, "learning_rate": 0.00018243862504691407, "loss": 0.2831, "step": 621 }, { "epoch": 0.8184210526315789, "grad_norm": 0.07939171423489742, "learning_rate": 0.00018235187016467442, "loss": 0.2831, "step": 622 }, { "epoch": 0.8197368421052632, "grad_norm": 0.08346477115007297, "learning_rate": 0.0001822649222552779, "loss": 0.2832, "step": 623 }, { "epoch": 0.8210526315789474, "grad_norm": 0.08777211185850274, "learning_rate": 0.0001821777815225245, "loss": 0.2946, "step": 624 }, { "epoch": 0.8223684210526315, "grad_norm": 0.08176020142698397, "learning_rate": 0.00018209044817066617, "loss": 0.2808, "step": 625 }, { "epoch": 0.8236842105263158, "grad_norm": 0.083353664452084, "learning_rate": 0.00018200292240440623, "loss": 0.2964, "step": 626 }, { "epoch": 0.825, "grad_norm": 0.08320257861919898, "learning_rate": 0.0001819152044288992, "loss": 0.2687, "step": 627 }, { "epoch": 0.8263157894736842, "grad_norm": 0.08315187924191537, "learning_rate": 0.00018182729444974992, "loss": 0.2911, "step": 628 }, { "epoch": 0.8276315789473684, "grad_norm": 0.08465028554778517, "learning_rate": 0.00018173919267301344, "loss": 0.2849, "step": 629 }, { "epoch": 0.8289473684210527, "grad_norm": 0.0808599203291463, "learning_rate": 0.0001816508993051943, "loss": 0.2842, "step": 630 }, { "epoch": 0.8302631578947368, "grad_norm": 0.08319880407928407, "learning_rate": 0.0001815624145532461, "loss": 0.2878, "step": 631 }, { "epoch": 0.8315789473684211, "grad_norm": 0.08454873694804513, "learning_rate": 0.00018147373862457107, "loss": 0.2977, "step": 632 }, { "epoch": 0.8328947368421052, "grad_norm": 0.08522249341627423, "learning_rate": 0.0001813848717270195, "loss": 0.2962, "step": 633 }, { "epoch": 0.8342105263157895, "grad_norm": 0.08804872368012215, "learning_rate": 0.00018129581406888936, "loss": 0.283, "step": 634 }, { "epoch": 0.8355263157894737, "grad_norm": 0.08019331668231523, "learning_rate": 0.00018120656585892572, "loss": 0.2856, "step": 635 }, { "epoch": 0.8368421052631579, "grad_norm": 0.08839395034159263, "learning_rate": 0.00018111712730632022, "loss": 0.3064, "step": 636 }, { "epoch": 0.8381578947368421, "grad_norm": 0.08267930670342614, "learning_rate": 0.00018102749862071083, "loss": 0.2884, "step": 637 }, { "epoch": 0.8394736842105263, "grad_norm": 0.08334284753944939, "learning_rate": 0.00018093768001218094, "loss": 0.2882, "step": 638 }, { "epoch": 0.8407894736842105, "grad_norm": 0.08293903985279776, "learning_rate": 0.00018084767169125932, "loss": 0.2855, "step": 639 }, { "epoch": 0.8421052631578947, "grad_norm": 0.08224010769882795, "learning_rate": 0.0001807574738689193, "loss": 0.3051, "step": 640 }, { "epoch": 0.843421052631579, "grad_norm": 0.07771382955899404, "learning_rate": 0.00018066708675657837, "loss": 0.255, "step": 641 }, { "epoch": 0.8447368421052631, "grad_norm": 0.08276207696773076, "learning_rate": 0.00018057651056609784, "loss": 0.2851, "step": 642 }, { "epoch": 0.8460526315789474, "grad_norm": 0.08138572227367649, "learning_rate": 0.000180485745509782, "loss": 0.2726, "step": 643 }, { "epoch": 0.8473684210526315, "grad_norm": 0.07991337682516308, "learning_rate": 0.000180394791800378, "loss": 0.2795, "step": 644 }, { "epoch": 0.8486842105263158, "grad_norm": 0.08368290547552978, "learning_rate": 0.0001803036496510752, "loss": 0.2876, "step": 645 }, { "epoch": 0.85, "grad_norm": 0.08602046286102522, "learning_rate": 0.0001802123192755044, "loss": 0.2899, "step": 646 }, { "epoch": 0.8513157894736842, "grad_norm": 0.08247602682736774, "learning_rate": 0.00018012080088773786, "loss": 0.289, "step": 647 }, { "epoch": 0.8526315789473684, "grad_norm": 0.08076494362770659, "learning_rate": 0.00018002909470228842, "loss": 0.2953, "step": 648 }, { "epoch": 0.8539473684210527, "grad_norm": 0.08213269492727011, "learning_rate": 0.0001799372009341091, "loss": 0.2834, "step": 649 }, { "epoch": 0.8552631578947368, "grad_norm": 0.07906596825371759, "learning_rate": 0.00017984511979859263, "loss": 0.2884, "step": 650 }, { "epoch": 0.8565789473684211, "grad_norm": 0.08280102631376626, "learning_rate": 0.0001797528515115709, "loss": 0.2898, "step": 651 }, { "epoch": 0.8578947368421053, "grad_norm": 0.08195845362740756, "learning_rate": 0.00017966039628931446, "loss": 0.2811, "step": 652 }, { "epoch": 0.8592105263157894, "grad_norm": 0.08275884521115351, "learning_rate": 0.00017956775434853201, "loss": 0.2809, "step": 653 }, { "epoch": 0.8605263157894737, "grad_norm": 0.0823561522762983, "learning_rate": 0.00017947492590637, "loss": 0.2733, "step": 654 }, { "epoch": 0.8618421052631579, "grad_norm": 0.08408575804402002, "learning_rate": 0.00017938191118041185, "loss": 0.2921, "step": 655 }, { "epoch": 0.8631578947368421, "grad_norm": 0.08263210898936545, "learning_rate": 0.00017928871038867784, "loss": 0.2896, "step": 656 }, { "epoch": 0.8644736842105263, "grad_norm": 0.08282963133814529, "learning_rate": 0.00017919532374962416, "loss": 0.267, "step": 657 }, { "epoch": 0.8657894736842106, "grad_norm": 0.08456674605493757, "learning_rate": 0.00017910175148214274, "loss": 0.2846, "step": 658 }, { "epoch": 0.8671052631578947, "grad_norm": 0.08295032078815928, "learning_rate": 0.00017900799380556065, "loss": 0.2957, "step": 659 }, { "epoch": 0.868421052631579, "grad_norm": 0.08407937334549097, "learning_rate": 0.00017891405093963938, "loss": 0.2801, "step": 660 }, { "epoch": 0.8697368421052631, "grad_norm": 0.08152847255098901, "learning_rate": 0.00017881992310457461, "loss": 0.287, "step": 661 }, { "epoch": 0.8710526315789474, "grad_norm": 0.07846383233153922, "learning_rate": 0.00017872561052099562, "loss": 0.2808, "step": 662 }, { "epoch": 0.8723684210526316, "grad_norm": 0.08455839714844739, "learning_rate": 0.00017863111340996458, "loss": 0.2887, "step": 663 }, { "epoch": 0.8736842105263158, "grad_norm": 0.08214963388362358, "learning_rate": 0.00017853643199297633, "loss": 0.2826, "step": 664 }, { "epoch": 0.875, "grad_norm": 0.08555672825696439, "learning_rate": 0.00017844156649195759, "loss": 0.2933, "step": 665 }, { "epoch": 0.8763157894736842, "grad_norm": 0.08460096862590989, "learning_rate": 0.00017834651712926662, "loss": 0.2826, "step": 666 }, { "epoch": 0.8776315789473684, "grad_norm": 0.08252349078451936, "learning_rate": 0.00017825128412769266, "loss": 0.2958, "step": 667 }, { "epoch": 0.8789473684210526, "grad_norm": 0.08712916060067398, "learning_rate": 0.00017815586771045535, "loss": 0.3058, "step": 668 }, { "epoch": 0.8802631578947369, "grad_norm": 0.08180591687568112, "learning_rate": 0.00017806026810120423, "loss": 0.2895, "step": 669 }, { "epoch": 0.881578947368421, "grad_norm": 0.07876973529889683, "learning_rate": 0.00017796448552401825, "loss": 0.286, "step": 670 }, { "epoch": 0.8828947368421053, "grad_norm": 0.07992980593985875, "learning_rate": 0.00017786852020340525, "loss": 0.2861, "step": 671 }, { "epoch": 0.8842105263157894, "grad_norm": 0.08596031959873009, "learning_rate": 0.0001777723723643014, "loss": 0.2956, "step": 672 }, { "epoch": 0.8855263157894737, "grad_norm": 0.08250501195390463, "learning_rate": 0.00017767604223207064, "loss": 0.2925, "step": 673 }, { "epoch": 0.8868421052631579, "grad_norm": 0.08065929962541825, "learning_rate": 0.00017757953003250422, "loss": 0.2751, "step": 674 }, { "epoch": 0.8881578947368421, "grad_norm": 0.079685175906907, "learning_rate": 0.00017748283599182014, "loss": 0.2713, "step": 675 }, { "epoch": 0.8894736842105263, "grad_norm": 0.08275584920700671, "learning_rate": 0.0001773859603366626, "loss": 0.2805, "step": 676 }, { "epoch": 0.8907894736842106, "grad_norm": 0.08237909937314071, "learning_rate": 0.00017728890329410157, "loss": 0.2769, "step": 677 }, { "epoch": 0.8921052631578947, "grad_norm": 0.08027830306430328, "learning_rate": 0.0001771916650916321, "loss": 0.2815, "step": 678 }, { "epoch": 0.8934210526315789, "grad_norm": 0.07965207648580828, "learning_rate": 0.00017709424595717388, "loss": 0.2767, "step": 679 }, { "epoch": 0.8947368421052632, "grad_norm": 0.08200170949634054, "learning_rate": 0.00017699664611907072, "loss": 0.2671, "step": 680 }, { "epoch": 0.8960526315789473, "grad_norm": 0.0837945547867222, "learning_rate": 0.00017689886580608998, "loss": 0.2846, "step": 681 }, { "epoch": 0.8973684210526316, "grad_norm": 0.08287411520644314, "learning_rate": 0.00017680090524742204, "loss": 0.2797, "step": 682 }, { "epoch": 0.8986842105263158, "grad_norm": 0.08346778301448879, "learning_rate": 0.0001767027646726797, "loss": 0.2851, "step": 683 }, { "epoch": 0.9, "grad_norm": 0.08078331578360252, "learning_rate": 0.0001766044443118978, "loss": 0.2758, "step": 684 }, { "epoch": 0.9013157894736842, "grad_norm": 0.0823050633372224, "learning_rate": 0.0001765059443955326, "loss": 0.2896, "step": 685 }, { "epoch": 0.9026315789473685, "grad_norm": 0.08085978291636725, "learning_rate": 0.00017640726515446103, "loss": 0.2752, "step": 686 }, { "epoch": 0.9039473684210526, "grad_norm": 0.08633725156118303, "learning_rate": 0.00017630840681998066, "loss": 0.2885, "step": 687 }, { "epoch": 0.9052631578947369, "grad_norm": 0.08134705054643286, "learning_rate": 0.00017620936962380856, "loss": 0.2741, "step": 688 }, { "epoch": 0.906578947368421, "grad_norm": 0.08218959378945977, "learning_rate": 0.0001761101537980812, "loss": 0.2932, "step": 689 }, { "epoch": 0.9078947368421053, "grad_norm": 0.08050157591340845, "learning_rate": 0.00017601075957535364, "loss": 0.2801, "step": 690 }, { "epoch": 0.9092105263157895, "grad_norm": 0.07842237436882951, "learning_rate": 0.00017591118718859923, "loss": 0.2672, "step": 691 }, { "epoch": 0.9105263157894737, "grad_norm": 0.08141690192884726, "learning_rate": 0.00017581143687120875, "loss": 0.2875, "step": 692 }, { "epoch": 0.9118421052631579, "grad_norm": 0.0841178887567793, "learning_rate": 0.00017571150885699023, "loss": 0.288, "step": 693 }, { "epoch": 0.9131578947368421, "grad_norm": 0.08199498281628458, "learning_rate": 0.00017561140338016802, "loss": 0.2908, "step": 694 }, { "epoch": 0.9144736842105263, "grad_norm": 0.07861915918731534, "learning_rate": 0.00017551112067538255, "loss": 0.2714, "step": 695 }, { "epoch": 0.9157894736842105, "grad_norm": 0.0831092654603713, "learning_rate": 0.00017541066097768963, "loss": 0.2829, "step": 696 }, { "epoch": 0.9171052631578948, "grad_norm": 0.08057002540174062, "learning_rate": 0.00017531002452255993, "loss": 0.2938, "step": 697 }, { "epoch": 0.9184210526315789, "grad_norm": 0.08109895602300159, "learning_rate": 0.00017520921154587843, "loss": 0.2913, "step": 698 }, { "epoch": 0.9197368421052632, "grad_norm": 0.08102699248656946, "learning_rate": 0.00017510822228394385, "loss": 0.2725, "step": 699 }, { "epoch": 0.9210526315789473, "grad_norm": 0.07787423365882763, "learning_rate": 0.0001750070569734681, "loss": 0.2741, "step": 700 }, { "epoch": 0.9223684210526316, "grad_norm": 0.07824132469372493, "learning_rate": 0.00017490571585157576, "loss": 0.2723, "step": 701 }, { "epoch": 0.9236842105263158, "grad_norm": 0.08382351085436456, "learning_rate": 0.00017480419915580356, "loss": 0.284, "step": 702 }, { "epoch": 0.925, "grad_norm": 0.0807625674700622, "learning_rate": 0.0001747025071240996, "loss": 0.2749, "step": 703 }, { "epoch": 0.9263157894736842, "grad_norm": 0.0816515881967252, "learning_rate": 0.00017460063999482316, "loss": 0.2789, "step": 704 }, { "epoch": 0.9276315789473685, "grad_norm": 0.07777785520204356, "learning_rate": 0.00017449859800674371, "loss": 0.2713, "step": 705 }, { "epoch": 0.9289473684210526, "grad_norm": 0.08076226646853048, "learning_rate": 0.0001743963813990408, "loss": 0.2953, "step": 706 }, { "epoch": 0.9302631578947368, "grad_norm": 0.07973740349278101, "learning_rate": 0.00017429399041130313, "loss": 0.2662, "step": 707 }, { "epoch": 0.9315789473684211, "grad_norm": 0.08456169813288093, "learning_rate": 0.00017419142528352817, "loss": 0.2859, "step": 708 }, { "epoch": 0.9328947368421052, "grad_norm": 0.08035481674307465, "learning_rate": 0.0001740886862561216, "loss": 0.2809, "step": 709 }, { "epoch": 0.9342105263157895, "grad_norm": 0.08259040712172755, "learning_rate": 0.00017398577356989665, "loss": 0.3024, "step": 710 }, { "epoch": 0.9355263157894737, "grad_norm": 0.08153299418934971, "learning_rate": 0.0001738826874660737, "loss": 0.2871, "step": 711 }, { "epoch": 0.9368421052631579, "grad_norm": 0.08013584006531711, "learning_rate": 0.00017377942818627942, "loss": 0.279, "step": 712 }, { "epoch": 0.9381578947368421, "grad_norm": 0.08047700820955582, "learning_rate": 0.00017367599597254655, "loss": 0.2801, "step": 713 }, { "epoch": 0.9394736842105263, "grad_norm": 0.07885052618166923, "learning_rate": 0.00017357239106731317, "loss": 0.2899, "step": 714 }, { "epoch": 0.9407894736842105, "grad_norm": 0.07929961550190592, "learning_rate": 0.0001734686137134221, "loss": 0.2796, "step": 715 }, { "epoch": 0.9421052631578948, "grad_norm": 0.08339513330893553, "learning_rate": 0.00017336466415412028, "loss": 0.2742, "step": 716 }, { "epoch": 0.9434210526315789, "grad_norm": 0.07669794587650931, "learning_rate": 0.00017326054263305847, "loss": 0.2718, "step": 717 }, { "epoch": 0.9447368421052632, "grad_norm": 0.0799990017084231, "learning_rate": 0.00017315624939429037, "loss": 0.287, "step": 718 }, { "epoch": 0.9460526315789474, "grad_norm": 0.07817929739263474, "learning_rate": 0.0001730517846822722, "loss": 0.2883, "step": 719 }, { "epoch": 0.9473684210526315, "grad_norm": 0.0795663214179471, "learning_rate": 0.0001729471487418621, "loss": 0.2693, "step": 720 }, { "epoch": 0.9486842105263158, "grad_norm": 0.0837605598262032, "learning_rate": 0.00017284234181831956, "loss": 0.2771, "step": 721 }, { "epoch": 0.95, "grad_norm": 0.08193295646472974, "learning_rate": 0.00017273736415730488, "loss": 0.3021, "step": 722 }, { "epoch": 0.9513157894736842, "grad_norm": 0.08213211517441393, "learning_rate": 0.00017263221600487852, "loss": 0.2696, "step": 723 }, { "epoch": 0.9526315789473684, "grad_norm": 0.08608126209062855, "learning_rate": 0.0001725268976075005, "loss": 0.2891, "step": 724 }, { "epoch": 0.9539473684210527, "grad_norm": 0.07916611709429071, "learning_rate": 0.00017242140921203003, "loss": 0.2833, "step": 725 }, { "epoch": 0.9552631578947368, "grad_norm": 0.08221381743802868, "learning_rate": 0.00017231575106572467, "loss": 0.2953, "step": 726 }, { "epoch": 0.9565789473684211, "grad_norm": 0.07838487546103713, "learning_rate": 0.0001722099234162399, "loss": 0.2854, "step": 727 }, { "epoch": 0.9578947368421052, "grad_norm": 0.08166198075045744, "learning_rate": 0.0001721039265116285, "loss": 0.2818, "step": 728 }, { "epoch": 0.9592105263157895, "grad_norm": 0.08220017081684095, "learning_rate": 0.00017199776060033997, "loss": 0.2761, "step": 729 }, { "epoch": 0.9605263157894737, "grad_norm": 0.08134420494867958, "learning_rate": 0.00017189142593121993, "loss": 0.2738, "step": 730 }, { "epoch": 0.9618421052631579, "grad_norm": 0.08157316013668467, "learning_rate": 0.00017178492275350958, "loss": 0.2637, "step": 731 }, { "epoch": 0.9631578947368421, "grad_norm": 0.0850556774542164, "learning_rate": 0.00017167825131684513, "loss": 0.292, "step": 732 }, { "epoch": 0.9644736842105263, "grad_norm": 0.08012953024736491, "learning_rate": 0.00017157141187125713, "loss": 0.2723, "step": 733 }, { "epoch": 0.9657894736842105, "grad_norm": 0.07862483655819683, "learning_rate": 0.00017146440466716991, "loss": 0.2856, "step": 734 }, { "epoch": 0.9671052631578947, "grad_norm": 0.08165975689746587, "learning_rate": 0.00017135722995540107, "loss": 0.2729, "step": 735 }, { "epoch": 0.968421052631579, "grad_norm": 0.07988073177015712, "learning_rate": 0.00017124988798716083, "loss": 0.2822, "step": 736 }, { "epoch": 0.9697368421052631, "grad_norm": 0.08138761359544663, "learning_rate": 0.00017114237901405134, "loss": 0.2642, "step": 737 }, { "epoch": 0.9710526315789474, "grad_norm": 0.08079811328305973, "learning_rate": 0.0001710347032880664, "loss": 0.2843, "step": 738 }, { "epoch": 0.9723684210526315, "grad_norm": 0.08408921154912564, "learning_rate": 0.00017092686106159053, "loss": 0.2924, "step": 739 }, { "epoch": 0.9736842105263158, "grad_norm": 0.0805385190287914, "learning_rate": 0.00017081885258739846, "loss": 0.2823, "step": 740 }, { "epoch": 0.975, "grad_norm": 0.08475021934413124, "learning_rate": 0.00017071067811865476, "loss": 0.2923, "step": 741 }, { "epoch": 0.9763157894736842, "grad_norm": 0.07950603456214735, "learning_rate": 0.00017060233790891296, "loss": 0.275, "step": 742 }, { "epoch": 0.9776315789473684, "grad_norm": 0.07785517335290744, "learning_rate": 0.0001704938322121151, "loss": 0.2831, "step": 743 }, { "epoch": 0.9789473684210527, "grad_norm": 0.07960926560941854, "learning_rate": 0.00017038516128259115, "loss": 0.2871, "step": 744 }, { "epoch": 0.9802631578947368, "grad_norm": 0.08293841281790956, "learning_rate": 0.00017027632537505832, "loss": 0.2753, "step": 745 }, { "epoch": 0.9815789473684211, "grad_norm": 0.07747002473316371, "learning_rate": 0.00017016732474462056, "loss": 0.2839, "step": 746 }, { "epoch": 0.9828947368421053, "grad_norm": 0.08046473275537684, "learning_rate": 0.00017005815964676787, "loss": 0.2768, "step": 747 }, { "epoch": 0.9842105263157894, "grad_norm": 0.08364061752605101, "learning_rate": 0.00016994883033737582, "loss": 0.2931, "step": 748 }, { "epoch": 0.9855263157894737, "grad_norm": 0.08050775331309325, "learning_rate": 0.0001698393370727048, "loss": 0.2893, "step": 749 }, { "epoch": 0.9868421052631579, "grad_norm": 0.08002147081513335, "learning_rate": 0.00016972968010939954, "loss": 0.2848, "step": 750 }, { "epoch": 0.9881578947368421, "grad_norm": 0.07808855249540297, "learning_rate": 0.0001696198597044885, "loss": 0.2846, "step": 751 }, { "epoch": 0.9894736842105263, "grad_norm": 0.07784201379245488, "learning_rate": 0.00016950987611538324, "loss": 0.2838, "step": 752 }, { "epoch": 0.9907894736842106, "grad_norm": 0.07950126160436796, "learning_rate": 0.0001693997295998777, "loss": 0.2933, "step": 753 }, { "epoch": 0.9921052631578947, "grad_norm": 0.08013885766603035, "learning_rate": 0.0001692894204161478, "loss": 0.2703, "step": 754 }, { "epoch": 0.993421052631579, "grad_norm": 0.08277409481218478, "learning_rate": 0.00016917894882275075, "loss": 0.2824, "step": 755 }, { "epoch": 0.9947368421052631, "grad_norm": 0.0816211725278549, "learning_rate": 0.00016906831507862443, "loss": 0.2781, "step": 756 }, { "epoch": 0.9960526315789474, "grad_norm": 0.08101924988182123, "learning_rate": 0.00016895751944308679, "loss": 0.284, "step": 757 }, { "epoch": 0.9973684210526316, "grad_norm": 0.08179919265807176, "learning_rate": 0.00016884656217583518, "loss": 0.2847, "step": 758 }, { "epoch": 0.9986842105263158, "grad_norm": 0.07951765088912952, "learning_rate": 0.00016873544353694588, "loss": 0.2869, "step": 759 }, { "epoch": 1.0, "grad_norm": 0.07809652892973841, "learning_rate": 0.0001686241637868734, "loss": 0.2809, "step": 760 }, { "epoch": 1.0, "eval_loss": 0.2806478440761566, "eval_runtime": 142.699, "eval_samples_per_second": 35.866, "eval_steps_per_second": 1.121, "step": 760 }, { "epoch": 1.0013157894736842, "grad_norm": 0.08089780903924765, "learning_rate": 0.0001685127231864498, "loss": 0.273, "step": 761 }, { "epoch": 1.0026315789473683, "grad_norm": 0.0791344049462135, "learning_rate": 0.00016840112199688432, "loss": 0.2746, "step": 762 }, { "epoch": 1.0039473684210527, "grad_norm": 0.07798719813102083, "learning_rate": 0.00016828936047976248, "loss": 0.2745, "step": 763 }, { "epoch": 1.0052631578947369, "grad_norm": 0.08227033417121575, "learning_rate": 0.00016817743889704565, "loss": 0.2617, "step": 764 }, { "epoch": 1.006578947368421, "grad_norm": 0.08639411451972336, "learning_rate": 0.00016806535751107037, "loss": 0.2674, "step": 765 }, { "epoch": 1.0078947368421052, "grad_norm": 0.08367139936296988, "learning_rate": 0.00016795311658454777, "loss": 0.279, "step": 766 }, { "epoch": 1.0092105263157896, "grad_norm": 0.08766002313298543, "learning_rate": 0.00016784071638056285, "loss": 0.283, "step": 767 }, { "epoch": 1.0105263157894737, "grad_norm": 0.08427710263543763, "learning_rate": 0.00016772815716257412, "loss": 0.2663, "step": 768 }, { "epoch": 1.0118421052631579, "grad_norm": 0.07801065155963043, "learning_rate": 0.0001676154391944126, "loss": 0.2559, "step": 769 }, { "epoch": 1.013157894736842, "grad_norm": 0.07698562715479776, "learning_rate": 0.00016750256274028152, "loss": 0.2748, "step": 770 }, { "epoch": 1.0144736842105264, "grad_norm": 0.08054717072786537, "learning_rate": 0.0001673895280647556, "loss": 0.2719, "step": 771 }, { "epoch": 1.0157894736842106, "grad_norm": 0.08399901308359886, "learning_rate": 0.0001672763354327804, "loss": 0.2735, "step": 772 }, { "epoch": 1.0171052631578947, "grad_norm": 0.07991338180666516, "learning_rate": 0.0001671629851096717, "loss": 0.2688, "step": 773 }, { "epoch": 1.018421052631579, "grad_norm": 0.0817891282862518, "learning_rate": 0.00016704947736111492, "loss": 0.2695, "step": 774 }, { "epoch": 1.019736842105263, "grad_norm": 0.084530202074618, "learning_rate": 0.00016693581245316442, "loss": 0.2667, "step": 775 }, { "epoch": 1.0210526315789474, "grad_norm": 0.08294234750514276, "learning_rate": 0.00016682199065224307, "loss": 0.2728, "step": 776 }, { "epoch": 1.0223684210526316, "grad_norm": 0.0832171550305921, "learning_rate": 0.00016670801222514134, "loss": 0.2778, "step": 777 }, { "epoch": 1.0236842105263158, "grad_norm": 0.08255803178852009, "learning_rate": 0.00016659387743901685, "loss": 0.2617, "step": 778 }, { "epoch": 1.025, "grad_norm": 0.08205321961071378, "learning_rate": 0.00016647958656139378, "loss": 0.2717, "step": 779 }, { "epoch": 1.0263157894736843, "grad_norm": 0.08258890569995453, "learning_rate": 0.00016636513986016213, "loss": 0.2658, "step": 780 }, { "epoch": 1.0276315789473685, "grad_norm": 0.08071843114802557, "learning_rate": 0.0001662505376035772, "loss": 0.2613, "step": 781 }, { "epoch": 1.0289473684210526, "grad_norm": 0.07886987321903878, "learning_rate": 0.00016613578006025872, "loss": 0.257, "step": 782 }, { "epoch": 1.0302631578947368, "grad_norm": 0.07877983241946228, "learning_rate": 0.00016602086749919063, "loss": 0.269, "step": 783 }, { "epoch": 1.0315789473684212, "grad_norm": 0.08280631625375447, "learning_rate": 0.0001659058001897201, "loss": 0.2687, "step": 784 }, { "epoch": 1.0328947368421053, "grad_norm": 0.08033744101706376, "learning_rate": 0.00016579057840155703, "loss": 0.2676, "step": 785 }, { "epoch": 1.0342105263157895, "grad_norm": 0.07853625991786962, "learning_rate": 0.00016567520240477344, "loss": 0.2475, "step": 786 }, { "epoch": 1.0355263157894736, "grad_norm": 0.08321225528586546, "learning_rate": 0.00016555967246980276, "loss": 0.2733, "step": 787 }, { "epoch": 1.0368421052631578, "grad_norm": 0.08271091060501498, "learning_rate": 0.00016544398886743933, "loss": 0.2669, "step": 788 }, { "epoch": 1.0381578947368422, "grad_norm": 0.08480813500178941, "learning_rate": 0.00016532815186883748, "loss": 0.2702, "step": 789 }, { "epoch": 1.0394736842105263, "grad_norm": 0.08423735368240373, "learning_rate": 0.0001652121617455113, "loss": 0.266, "step": 790 }, { "epoch": 1.0407894736842105, "grad_norm": 0.08270665791388272, "learning_rate": 0.00016509601876933374, "loss": 0.2748, "step": 791 }, { "epoch": 1.0421052631578946, "grad_norm": 0.08074688262914587, "learning_rate": 0.000164979723212536, "loss": 0.2681, "step": 792 }, { "epoch": 1.043421052631579, "grad_norm": 0.08329297167987026, "learning_rate": 0.0001648632753477068, "loss": 0.2737, "step": 793 }, { "epoch": 1.0447368421052632, "grad_norm": 0.08195503131845851, "learning_rate": 0.0001647466754477921, "loss": 0.2605, "step": 794 }, { "epoch": 1.0460526315789473, "grad_norm": 0.08219664275017928, "learning_rate": 0.00016462992378609407, "loss": 0.2726, "step": 795 }, { "epoch": 1.0473684210526315, "grad_norm": 0.081155677466978, "learning_rate": 0.00016451302063627066, "loss": 0.2683, "step": 796 }, { "epoch": 1.0486842105263159, "grad_norm": 0.07978496536582978, "learning_rate": 0.0001643959662723348, "loss": 0.2639, "step": 797 }, { "epoch": 1.05, "grad_norm": 0.07736573715270778, "learning_rate": 0.00016427876096865394, "loss": 0.2523, "step": 798 }, { "epoch": 1.0513157894736842, "grad_norm": 0.08461912280741266, "learning_rate": 0.0001641614049999493, "loss": 0.269, "step": 799 }, { "epoch": 1.0526315789473684, "grad_norm": 0.08431583636176626, "learning_rate": 0.00016404389864129533, "loss": 0.2563, "step": 800 }, { "epoch": 1.0539473684210527, "grad_norm": 0.08694565310464801, "learning_rate": 0.00016392624216811879, "loss": 0.2778, "step": 801 }, { "epoch": 1.055263157894737, "grad_norm": 0.08352264819913631, "learning_rate": 0.00016380843585619845, "loss": 0.2742, "step": 802 }, { "epoch": 1.056578947368421, "grad_norm": 0.08421963392115288, "learning_rate": 0.0001636904799816643, "loss": 0.271, "step": 803 }, { "epoch": 1.0578947368421052, "grad_norm": 0.07881294236501658, "learning_rate": 0.00016357237482099684, "loss": 0.2599, "step": 804 }, { "epoch": 1.0592105263157894, "grad_norm": 0.08602162394576658, "learning_rate": 0.0001634541206510264, "loss": 0.2729, "step": 805 }, { "epoch": 1.0605263157894738, "grad_norm": 0.08348451901525375, "learning_rate": 0.00016333571774893285, "loss": 0.2753, "step": 806 }, { "epoch": 1.061842105263158, "grad_norm": 0.0818737526419465, "learning_rate": 0.00016321716639224434, "loss": 0.2715, "step": 807 }, { "epoch": 1.063157894736842, "grad_norm": 0.08286281227398601, "learning_rate": 0.00016309846685883726, "loss": 0.2675, "step": 808 }, { "epoch": 1.0644736842105262, "grad_norm": 0.07966839196252047, "learning_rate": 0.00016297961942693512, "loss": 0.2558, "step": 809 }, { "epoch": 1.0657894736842106, "grad_norm": 0.08414413297914877, "learning_rate": 0.0001628606243751082, "loss": 0.2844, "step": 810 }, { "epoch": 1.0671052631578948, "grad_norm": 0.08555085876337333, "learning_rate": 0.00016274148198227282, "loss": 0.2851, "step": 811 }, { "epoch": 1.068421052631579, "grad_norm": 0.08082783425810476, "learning_rate": 0.00016262219252769064, "loss": 0.2646, "step": 812 }, { "epoch": 1.069736842105263, "grad_norm": 0.08197732764454009, "learning_rate": 0.00016250275629096786, "loss": 0.2779, "step": 813 }, { "epoch": 1.0710526315789473, "grad_norm": 0.08526782512435523, "learning_rate": 0.00016238317355205494, "loss": 0.2799, "step": 814 }, { "epoch": 1.0723684210526316, "grad_norm": 0.07823583584470825, "learning_rate": 0.00016226344459124566, "loss": 0.2643, "step": 815 }, { "epoch": 1.0736842105263158, "grad_norm": 0.08603151669795409, "learning_rate": 0.00016214356968917648, "loss": 0.273, "step": 816 }, { "epoch": 1.075, "grad_norm": 0.08851720457464476, "learning_rate": 0.000162023549126826, "loss": 0.2721, "step": 817 }, { "epoch": 1.0763157894736841, "grad_norm": 0.08133793366760693, "learning_rate": 0.00016190338318551427, "loss": 0.255, "step": 818 }, { "epoch": 1.0776315789473685, "grad_norm": 0.082819100026463, "learning_rate": 0.00016178307214690193, "loss": 0.2624, "step": 819 }, { "epoch": 1.0789473684210527, "grad_norm": 0.08530141351749654, "learning_rate": 0.00016166261629298995, "loss": 0.2659, "step": 820 }, { "epoch": 1.0802631578947368, "grad_norm": 0.08366548952005705, "learning_rate": 0.00016154201590611852, "loss": 0.2627, "step": 821 }, { "epoch": 1.081578947368421, "grad_norm": 0.08224916068858729, "learning_rate": 0.0001614212712689668, "loss": 0.2702, "step": 822 }, { "epoch": 1.0828947368421054, "grad_norm": 0.08321699362162292, "learning_rate": 0.0001613003826645519, "loss": 0.2668, "step": 823 }, { "epoch": 1.0842105263157895, "grad_norm": 0.07940176654017943, "learning_rate": 0.0001611793503762285, "loss": 0.2666, "step": 824 }, { "epoch": 1.0855263157894737, "grad_norm": 0.0810039782214103, "learning_rate": 0.00016105817468768798, "loss": 0.2748, "step": 825 }, { "epoch": 1.0868421052631578, "grad_norm": 0.08080799943317177, "learning_rate": 0.00016093685588295786, "loss": 0.2664, "step": 826 }, { "epoch": 1.0881578947368422, "grad_norm": 0.07963385945239661, "learning_rate": 0.00016081539424640118, "loss": 0.279, "step": 827 }, { "epoch": 1.0894736842105264, "grad_norm": 0.08180594063442842, "learning_rate": 0.00016069379006271566, "loss": 0.2735, "step": 828 }, { "epoch": 1.0907894736842105, "grad_norm": 0.08134763460779691, "learning_rate": 0.00016057204361693327, "loss": 0.2666, "step": 829 }, { "epoch": 1.0921052631578947, "grad_norm": 0.08433241545175918, "learning_rate": 0.0001604501551944193, "loss": 0.2582, "step": 830 }, { "epoch": 1.0934210526315788, "grad_norm": 0.08109132578451536, "learning_rate": 0.0001603281250808719, "loss": 0.2648, "step": 831 }, { "epoch": 1.0947368421052632, "grad_norm": 0.09007590757575104, "learning_rate": 0.00016020595356232135, "loss": 0.2681, "step": 832 }, { "epoch": 1.0960526315789474, "grad_norm": 0.08677922191238775, "learning_rate": 0.00016008364092512926, "loss": 0.2761, "step": 833 }, { "epoch": 1.0973684210526315, "grad_norm": 0.08575921288664821, "learning_rate": 0.00015996118745598817, "loss": 0.2696, "step": 834 }, { "epoch": 1.0986842105263157, "grad_norm": 0.08041317041434254, "learning_rate": 0.00015983859344192061, "loss": 0.2689, "step": 835 }, { "epoch": 1.1, "grad_norm": 0.08065596311704078, "learning_rate": 0.00015971585917027862, "loss": 0.2687, "step": 836 }, { "epoch": 1.1013157894736842, "grad_norm": 0.0790113706176156, "learning_rate": 0.00015959298492874288, "loss": 0.2555, "step": 837 }, { "epoch": 1.1026315789473684, "grad_norm": 0.08012262373177113, "learning_rate": 0.0001594699710053223, "loss": 0.2685, "step": 838 }, { "epoch": 1.1039473684210526, "grad_norm": 0.08504138731346632, "learning_rate": 0.00015934681768835297, "loss": 0.2805, "step": 839 }, { "epoch": 1.1052631578947367, "grad_norm": 0.08095254397891562, "learning_rate": 0.00015922352526649803, "loss": 0.2709, "step": 840 }, { "epoch": 1.106578947368421, "grad_norm": 0.08674994051804193, "learning_rate": 0.00015910009402874631, "loss": 0.2806, "step": 841 }, { "epoch": 1.1078947368421053, "grad_norm": 0.09194758435288344, "learning_rate": 0.0001589765242644124, "loss": 0.2736, "step": 842 }, { "epoch": 1.1092105263157894, "grad_norm": 0.08318187378912749, "learning_rate": 0.00015885281626313517, "loss": 0.2606, "step": 843 }, { "epoch": 1.1105263157894736, "grad_norm": 0.0825846308272865, "learning_rate": 0.00015872897031487791, "loss": 0.2628, "step": 844 }, { "epoch": 1.111842105263158, "grad_norm": 0.08112434803943486, "learning_rate": 0.00015860498670992691, "loss": 0.2793, "step": 845 }, { "epoch": 1.1131578947368421, "grad_norm": 0.08537425779356113, "learning_rate": 0.00015848086573889137, "loss": 0.2779, "step": 846 }, { "epoch": 1.1144736842105263, "grad_norm": 0.08315694519509176, "learning_rate": 0.00015835660769270232, "loss": 0.2719, "step": 847 }, { "epoch": 1.1157894736842104, "grad_norm": 0.08130780896158304, "learning_rate": 0.00015823221286261215, "loss": 0.2653, "step": 848 }, { "epoch": 1.1171052631578948, "grad_norm": 0.08219491647444532, "learning_rate": 0.00015810768154019385, "loss": 0.2765, "step": 849 }, { "epoch": 1.118421052631579, "grad_norm": 0.08240059160474296, "learning_rate": 0.0001579830140173403, "loss": 0.2737, "step": 850 }, { "epoch": 1.1197368421052631, "grad_norm": 0.083577356578667, "learning_rate": 0.00015785821058626366, "loss": 0.2661, "step": 851 }, { "epoch": 1.1210526315789473, "grad_norm": 0.07834322980894085, "learning_rate": 0.00015773327153949465, "loss": 0.2729, "step": 852 }, { "epoch": 1.1223684210526317, "grad_norm": 0.08346738623736531, "learning_rate": 0.00015760819716988187, "loss": 0.275, "step": 853 }, { "epoch": 1.1236842105263158, "grad_norm": 0.07884897152428384, "learning_rate": 0.00015748298777059112, "loss": 0.2805, "step": 854 }, { "epoch": 1.125, "grad_norm": 0.08020439719046484, "learning_rate": 0.0001573576436351046, "loss": 0.2587, "step": 855 }, { "epoch": 1.1263157894736842, "grad_norm": 0.08327214298097156, "learning_rate": 0.0001572321650572205, "loss": 0.2626, "step": 856 }, { "epoch": 1.1276315789473683, "grad_norm": 0.08372000718554913, "learning_rate": 0.00015710655233105194, "loss": 0.2743, "step": 857 }, { "epoch": 1.1289473684210527, "grad_norm": 0.08197882426728544, "learning_rate": 0.00015698080575102661, "loss": 0.2657, "step": 858 }, { "epoch": 1.1302631578947369, "grad_norm": 0.08271785330561393, "learning_rate": 0.00015685492561188594, "loss": 0.2696, "step": 859 }, { "epoch": 1.131578947368421, "grad_norm": 0.08153102919965115, "learning_rate": 0.00015672891220868432, "loss": 0.2627, "step": 860 }, { "epoch": 1.1328947368421052, "grad_norm": 0.08273222452553464, "learning_rate": 0.00015660276583678853, "loss": 0.2742, "step": 861 }, { "epoch": 1.1342105263157896, "grad_norm": 0.08149761866108174, "learning_rate": 0.0001564764867918771, "loss": 0.284, "step": 862 }, { "epoch": 1.1355263157894737, "grad_norm": 0.07732037718328716, "learning_rate": 0.0001563500753699395, "loss": 0.2559, "step": 863 }, { "epoch": 1.1368421052631579, "grad_norm": 0.08033074721619064, "learning_rate": 0.00015622353186727544, "loss": 0.2651, "step": 864 }, { "epoch": 1.138157894736842, "grad_norm": 0.07988808525323399, "learning_rate": 0.0001560968565804942, "loss": 0.2676, "step": 865 }, { "epoch": 1.1394736842105262, "grad_norm": 0.07997455834407387, "learning_rate": 0.00015597004980651407, "loss": 0.2597, "step": 866 }, { "epoch": 1.1407894736842106, "grad_norm": 0.07985601487062838, "learning_rate": 0.0001558431118425614, "loss": 0.257, "step": 867 }, { "epoch": 1.1421052631578947, "grad_norm": 0.08269154375775314, "learning_rate": 0.0001557160429861702, "loss": 0.2807, "step": 868 }, { "epoch": 1.143421052631579, "grad_norm": 0.08357702138558408, "learning_rate": 0.00015558884353518107, "loss": 0.2767, "step": 869 }, { "epoch": 1.1447368421052633, "grad_norm": 0.08414408194028052, "learning_rate": 0.00015546151378774086, "loss": 0.274, "step": 870 }, { "epoch": 1.1460526315789474, "grad_norm": 0.08701268358835322, "learning_rate": 0.00015533405404230188, "loss": 0.2811, "step": 871 }, { "epoch": 1.1473684210526316, "grad_norm": 0.08262282790058861, "learning_rate": 0.000155206464597621, "loss": 0.2731, "step": 872 }, { "epoch": 1.1486842105263158, "grad_norm": 0.08408606653507734, "learning_rate": 0.00015507874575275917, "loss": 0.251, "step": 873 }, { "epoch": 1.15, "grad_norm": 0.08049323373059812, "learning_rate": 0.0001549508978070806, "loss": 0.2715, "step": 874 }, { "epoch": 1.1513157894736843, "grad_norm": 0.07972519186854046, "learning_rate": 0.0001548229210602522, "loss": 0.2671, "step": 875 }, { "epoch": 1.1526315789473685, "grad_norm": 0.07882804213132247, "learning_rate": 0.00015469481581224272, "loss": 0.2619, "step": 876 }, { "epoch": 1.1539473684210526, "grad_norm": 0.0798538301353752, "learning_rate": 0.00015456658236332203, "loss": 0.2736, "step": 877 }, { "epoch": 1.1552631578947368, "grad_norm": 0.08315715083917044, "learning_rate": 0.00015443822101406064, "loss": 0.2664, "step": 878 }, { "epoch": 1.1565789473684212, "grad_norm": 0.08372404146892015, "learning_rate": 0.00015430973206532878, "loss": 0.2686, "step": 879 }, { "epoch": 1.1578947368421053, "grad_norm": 0.08175706218650852, "learning_rate": 0.00015418111581829574, "loss": 0.2622, "step": 880 }, { "epoch": 1.1592105263157895, "grad_norm": 0.07882686056716785, "learning_rate": 0.00015405237257442924, "loss": 0.269, "step": 881 }, { "epoch": 1.1605263157894736, "grad_norm": 0.07950928567287992, "learning_rate": 0.0001539235026354946, "loss": 0.2628, "step": 882 }, { "epoch": 1.1618421052631578, "grad_norm": 0.0850340095145124, "learning_rate": 0.00015379450630355424, "loss": 0.2791, "step": 883 }, { "epoch": 1.1631578947368422, "grad_norm": 0.0941613058127184, "learning_rate": 0.0001536653838809667, "loss": 0.279, "step": 884 }, { "epoch": 1.1644736842105263, "grad_norm": 0.08251843104601717, "learning_rate": 0.00015353613567038607, "loss": 0.2859, "step": 885 }, { "epoch": 1.1657894736842105, "grad_norm": 0.08038962867293699, "learning_rate": 0.0001534067619747614, "loss": 0.2747, "step": 886 }, { "epoch": 1.1671052631578946, "grad_norm": 0.0794561388070862, "learning_rate": 0.00015327726309733572, "loss": 0.2686, "step": 887 }, { "epoch": 1.168421052631579, "grad_norm": 0.07884245258276419, "learning_rate": 0.0001531476393416456, "loss": 0.2672, "step": 888 }, { "epoch": 1.1697368421052632, "grad_norm": 0.08136001457411983, "learning_rate": 0.00015301789101152026, "loss": 0.2672, "step": 889 }, { "epoch": 1.1710526315789473, "grad_norm": 0.08416373900750189, "learning_rate": 0.00015288801841108093, "loss": 0.2823, "step": 890 }, { "epoch": 1.1723684210526315, "grad_norm": 0.0820367167436313, "learning_rate": 0.0001527580218447401, "loss": 0.2578, "step": 891 }, { "epoch": 1.1736842105263159, "grad_norm": 0.08309627633941746, "learning_rate": 0.0001526279016172008, "loss": 0.267, "step": 892 }, { "epoch": 1.175, "grad_norm": 0.08128917587224013, "learning_rate": 0.000152497658033456, "loss": 0.2561, "step": 893 }, { "epoch": 1.1763157894736842, "grad_norm": 0.08178638667577875, "learning_rate": 0.00015236729139878782, "loss": 0.2768, "step": 894 }, { "epoch": 1.1776315789473684, "grad_norm": 0.0801661892105483, "learning_rate": 0.0001522368020187666, "loss": 0.2543, "step": 895 }, { "epoch": 1.1789473684210527, "grad_norm": 0.08213132733512518, "learning_rate": 0.00015210619019925066, "loss": 0.2636, "step": 896 }, { "epoch": 1.180263157894737, "grad_norm": 0.08080981772989361, "learning_rate": 0.00015197545624638504, "loss": 0.267, "step": 897 }, { "epoch": 1.181578947368421, "grad_norm": 0.07879728180506462, "learning_rate": 0.00015184460046660137, "loss": 0.2622, "step": 898 }, { "epoch": 1.1828947368421052, "grad_norm": 0.07930675022154604, "learning_rate": 0.00015171362316661652, "loss": 0.2635, "step": 899 }, { "epoch": 1.1842105263157894, "grad_norm": 0.0827303133693508, "learning_rate": 0.00015158252465343242, "loss": 0.2731, "step": 900 }, { "epoch": 1.1855263157894738, "grad_norm": 0.08133340547715927, "learning_rate": 0.00015145130523433492, "loss": 0.2691, "step": 901 }, { "epoch": 1.186842105263158, "grad_norm": 0.08320419791917552, "learning_rate": 0.00015131996521689352, "loss": 0.2831, "step": 902 }, { "epoch": 1.188157894736842, "grad_norm": 0.078583719040117, "learning_rate": 0.00015118850490896012, "loss": 0.2605, "step": 903 }, { "epoch": 1.1894736842105262, "grad_norm": 0.07987941146339982, "learning_rate": 0.00015105692461866874, "loss": 0.271, "step": 904 }, { "epoch": 1.1907894736842106, "grad_norm": 0.08065385126525215, "learning_rate": 0.0001509252246544346, "loss": 0.2748, "step": 905 }, { "epoch": 1.1921052631578948, "grad_norm": 0.08045853100208103, "learning_rate": 0.00015079340532495343, "loss": 0.2793, "step": 906 }, { "epoch": 1.193421052631579, "grad_norm": 0.07925415373688983, "learning_rate": 0.00015066146693920072, "loss": 0.2625, "step": 907 }, { "epoch": 1.194736842105263, "grad_norm": 0.08327329284089614, "learning_rate": 0.000150529409806431, "loss": 0.2679, "step": 908 }, { "epoch": 1.1960526315789473, "grad_norm": 0.08369037313818105, "learning_rate": 0.0001503972342361772, "loss": 0.2729, "step": 909 }, { "epoch": 1.1973684210526316, "grad_norm": 0.08698590728820178, "learning_rate": 0.00015026494053824982, "loss": 0.2792, "step": 910 }, { "epoch": 1.1986842105263158, "grad_norm": 0.08234237296181915, "learning_rate": 0.0001501325290227362, "loss": 0.2723, "step": 911 }, { "epoch": 1.2, "grad_norm": 0.08018132065204943, "learning_rate": 0.00015000000000000001, "loss": 0.2582, "step": 912 }, { "epoch": 1.2013157894736841, "grad_norm": 0.08029223756152773, "learning_rate": 0.0001498673537806801, "loss": 0.2605, "step": 913 }, { "epoch": 1.2026315789473685, "grad_norm": 0.07906404673525916, "learning_rate": 0.00014973459067569022, "loss": 0.2691, "step": 914 }, { "epoch": 1.2039473684210527, "grad_norm": 0.0838278381246816, "learning_rate": 0.00014960171099621795, "loss": 0.2697, "step": 915 }, { "epoch": 1.2052631578947368, "grad_norm": 0.08289248581844369, "learning_rate": 0.00014946871505372425, "loss": 0.2789, "step": 916 }, { "epoch": 1.206578947368421, "grad_norm": 0.08065473943197897, "learning_rate": 0.0001493356031599425, "loss": 0.2741, "step": 917 }, { "epoch": 1.2078947368421054, "grad_norm": 0.08222068877129116, "learning_rate": 0.00014920237562687785, "loss": 0.268, "step": 918 }, { "epoch": 1.2092105263157895, "grad_norm": 0.0819456615892803, "learning_rate": 0.00014906903276680654, "loss": 0.267, "step": 919 }, { "epoch": 1.2105263157894737, "grad_norm": 0.08178515229941181, "learning_rate": 0.00014893557489227517, "loss": 0.2711, "step": 920 }, { "epoch": 1.2118421052631578, "grad_norm": 0.08167974669157664, "learning_rate": 0.00014880200231609983, "loss": 0.2623, "step": 921 }, { "epoch": 1.2131578947368422, "grad_norm": 0.07644033108650616, "learning_rate": 0.00014866831535136554, "loss": 0.2592, "step": 922 }, { "epoch": 1.2144736842105264, "grad_norm": 0.08247670698556289, "learning_rate": 0.00014853451431142537, "loss": 0.2563, "step": 923 }, { "epoch": 1.2157894736842105, "grad_norm": 0.08121309159730847, "learning_rate": 0.0001484005995098999, "loss": 0.2651, "step": 924 }, { "epoch": 1.2171052631578947, "grad_norm": 0.0813185941928777, "learning_rate": 0.0001482665712606762, "loss": 0.2654, "step": 925 }, { "epoch": 1.2184210526315788, "grad_norm": 0.07992066628575906, "learning_rate": 0.00014813242987790734, "loss": 0.2601, "step": 926 }, { "epoch": 1.2197368421052632, "grad_norm": 0.08348022762772563, "learning_rate": 0.00014799817567601157, "loss": 0.2698, "step": 927 }, { "epoch": 1.2210526315789474, "grad_norm": 0.08164215203446346, "learning_rate": 0.0001478638089696716, "loss": 0.261, "step": 928 }, { "epoch": 1.2223684210526315, "grad_norm": 0.08053428830419256, "learning_rate": 0.00014772933007383372, "loss": 0.271, "step": 929 }, { "epoch": 1.2236842105263157, "grad_norm": 0.0855332022802314, "learning_rate": 0.00014759473930370736, "loss": 0.2806, "step": 930 }, { "epoch": 1.225, "grad_norm": 0.07841552340055463, "learning_rate": 0.00014746003697476404, "loss": 0.2562, "step": 931 }, { "epoch": 1.2263157894736842, "grad_norm": 0.08012970553706716, "learning_rate": 0.00014732522340273684, "loss": 0.2653, "step": 932 }, { "epoch": 1.2276315789473684, "grad_norm": 0.08013137552721004, "learning_rate": 0.00014719029890361955, "loss": 0.2614, "step": 933 }, { "epoch": 1.2289473684210526, "grad_norm": 0.08076902086359555, "learning_rate": 0.000147055263793666, "loss": 0.2726, "step": 934 }, { "epoch": 1.2302631578947367, "grad_norm": 0.07816751188820396, "learning_rate": 0.0001469201183893892, "loss": 0.2697, "step": 935 }, { "epoch": 1.231578947368421, "grad_norm": 0.07958991637658457, "learning_rate": 0.0001467848630075608, "loss": 0.2718, "step": 936 }, { "epoch": 1.2328947368421053, "grad_norm": 0.07907729388333563, "learning_rate": 0.00014664949796521013, "loss": 0.2542, "step": 937 }, { "epoch": 1.2342105263157894, "grad_norm": 0.08337394014823166, "learning_rate": 0.00014651402357962367, "loss": 0.2634, "step": 938 }, { "epoch": 1.2355263157894738, "grad_norm": 0.07981852744305896, "learning_rate": 0.00014637844016834406, "loss": 0.2669, "step": 939 }, { "epoch": 1.236842105263158, "grad_norm": 0.07805563922628504, "learning_rate": 0.00014624274804916958, "loss": 0.2604, "step": 940 }, { "epoch": 1.2381578947368421, "grad_norm": 0.07875476925853715, "learning_rate": 0.00014610694754015326, "loss": 0.2605, "step": 941 }, { "epoch": 1.2394736842105263, "grad_norm": 0.08212032555073835, "learning_rate": 0.00014597103895960226, "loss": 0.2608, "step": 942 }, { "epoch": 1.2407894736842104, "grad_norm": 0.08193583484443283, "learning_rate": 0.00014583502262607696, "loss": 0.2671, "step": 943 }, { "epoch": 1.2421052631578948, "grad_norm": 0.08018625970236232, "learning_rate": 0.00014569889885839037, "loss": 0.2482, "step": 944 }, { "epoch": 1.243421052631579, "grad_norm": 0.08080487708321592, "learning_rate": 0.00014556266797560732, "loss": 0.2581, "step": 945 }, { "epoch": 1.2447368421052631, "grad_norm": 0.08127791305985531, "learning_rate": 0.00014542633029704366, "loss": 0.2645, "step": 946 }, { "epoch": 1.2460526315789473, "grad_norm": 0.0789744299917556, "learning_rate": 0.00014528988614226563, "loss": 0.258, "step": 947 }, { "epoch": 1.2473684210526317, "grad_norm": 0.08138536304735684, "learning_rate": 0.00014515333583108896, "loss": 0.2602, "step": 948 }, { "epoch": 1.2486842105263158, "grad_norm": 0.08347086931054207, "learning_rate": 0.00014501667968357825, "loss": 0.2737, "step": 949 }, { "epoch": 1.25, "grad_norm": 0.07961359280977227, "learning_rate": 0.00014487991802004623, "loss": 0.269, "step": 950 }, { "epoch": 1.2513157894736842, "grad_norm": 0.08326297319920578, "learning_rate": 0.00014474305116105284, "loss": 0.2627, "step": 951 }, { "epoch": 1.2526315789473683, "grad_norm": 0.08085513791177462, "learning_rate": 0.00014460607942740468, "loss": 0.2679, "step": 952 }, { "epoch": 1.2539473684210527, "grad_norm": 0.07868241470750258, "learning_rate": 0.00014446900314015411, "loss": 0.2699, "step": 953 }, { "epoch": 1.2552631578947369, "grad_norm": 0.0836320162619728, "learning_rate": 0.0001443318226205986, "loss": 0.2792, "step": 954 }, { "epoch": 1.256578947368421, "grad_norm": 0.08451883775470095, "learning_rate": 0.00014419453819027988, "loss": 0.2739, "step": 955 }, { "epoch": 1.2578947368421054, "grad_norm": 0.08034662053541705, "learning_rate": 0.00014405715017098335, "loss": 0.263, "step": 956 }, { "epoch": 1.2592105263157896, "grad_norm": 0.07928448098795568, "learning_rate": 0.00014391965888473703, "loss": 0.2543, "step": 957 }, { "epoch": 1.2605263157894737, "grad_norm": 0.08392429826580132, "learning_rate": 0.0001437820646538112, "loss": 0.2775, "step": 958 }, { "epoch": 1.2618421052631579, "grad_norm": 0.07940539289961981, "learning_rate": 0.00014364436780071727, "loss": 0.2685, "step": 959 }, { "epoch": 1.263157894736842, "grad_norm": 0.08036645877207896, "learning_rate": 0.00014350656864820733, "loss": 0.2683, "step": 960 }, { "epoch": 1.2644736842105262, "grad_norm": 0.07812397442869837, "learning_rate": 0.0001433686675192731, "loss": 0.2639, "step": 961 }, { "epoch": 1.2657894736842106, "grad_norm": 0.07914775180391949, "learning_rate": 0.00014323066473714542, "loss": 0.2612, "step": 962 }, { "epoch": 1.2671052631578947, "grad_norm": 0.07880507910786637, "learning_rate": 0.00014309256062529344, "loss": 0.269, "step": 963 }, { "epoch": 1.268421052631579, "grad_norm": 0.08099065681511952, "learning_rate": 0.0001429543555074237, "loss": 0.2649, "step": 964 }, { "epoch": 1.2697368421052633, "grad_norm": 0.07889461321025316, "learning_rate": 0.00014281604970747954, "loss": 0.2618, "step": 965 }, { "epoch": 1.2710526315789474, "grad_norm": 0.07976753992813429, "learning_rate": 0.00014267764354964038, "loss": 0.2706, "step": 966 }, { "epoch": 1.2723684210526316, "grad_norm": 0.07872266626170443, "learning_rate": 0.00014253913735832075, "loss": 0.2642, "step": 967 }, { "epoch": 1.2736842105263158, "grad_norm": 0.08023355760105522, "learning_rate": 0.00014240053145816967, "loss": 0.2568, "step": 968 }, { "epoch": 1.275, "grad_norm": 0.0810413485993457, "learning_rate": 0.00014226182617406996, "loss": 0.2529, "step": 969 }, { "epoch": 1.2763157894736843, "grad_norm": 0.08212814628024637, "learning_rate": 0.00014212302183113732, "loss": 0.2629, "step": 970 }, { "epoch": 1.2776315789473685, "grad_norm": 0.08059136593229341, "learning_rate": 0.00014198411875471955, "loss": 0.2642, "step": 971 }, { "epoch": 1.2789473684210526, "grad_norm": 0.08140236020647386, "learning_rate": 0.00014184511727039612, "loss": 0.2562, "step": 972 }, { "epoch": 1.2802631578947368, "grad_norm": 0.08437007742270067, "learning_rate": 0.00014170601770397692, "loss": 0.2745, "step": 973 }, { "epoch": 1.2815789473684212, "grad_norm": 0.08380982717641713, "learning_rate": 0.00014156682038150183, "loss": 0.2759, "step": 974 }, { "epoch": 1.2828947368421053, "grad_norm": 0.0841772070431119, "learning_rate": 0.00014142752562923988, "loss": 0.2753, "step": 975 }, { "epoch": 1.2842105263157895, "grad_norm": 0.08106806985375134, "learning_rate": 0.0001412881337736885, "loss": 0.2767, "step": 976 }, { "epoch": 1.2855263157894736, "grad_norm": 0.0811005519720408, "learning_rate": 0.00014114864514157258, "loss": 0.2683, "step": 977 }, { "epoch": 1.2868421052631578, "grad_norm": 0.08017428089067932, "learning_rate": 0.00014100906005984403, "loss": 0.2723, "step": 978 }, { "epoch": 1.2881578947368422, "grad_norm": 0.07478870745109191, "learning_rate": 0.00014086937885568067, "loss": 0.2593, "step": 979 }, { "epoch": 1.2894736842105263, "grad_norm": 0.07742432071526005, "learning_rate": 0.00014072960185648577, "loss": 0.2631, "step": 980 }, { "epoch": 1.2907894736842105, "grad_norm": 0.07794666505896067, "learning_rate": 0.000140589729389887, "loss": 0.2644, "step": 981 }, { "epoch": 1.2921052631578949, "grad_norm": 0.07946767921967093, "learning_rate": 0.0001404497617837359, "loss": 0.2609, "step": 982 }, { "epoch": 1.293421052631579, "grad_norm": 0.08082944776271564, "learning_rate": 0.00014030969936610697, "loss": 0.2734, "step": 983 }, { "epoch": 1.2947368421052632, "grad_norm": 0.08360749789540052, "learning_rate": 0.00014016954246529696, "loss": 0.263, "step": 984 }, { "epoch": 1.2960526315789473, "grad_norm": 0.08375341936105894, "learning_rate": 0.00014002929140982404, "loss": 0.2639, "step": 985 }, { "epoch": 1.2973684210526315, "grad_norm": 0.08336578005654308, "learning_rate": 0.00013988894652842713, "loss": 0.2777, "step": 986 }, { "epoch": 1.2986842105263157, "grad_norm": 0.07859275037760449, "learning_rate": 0.00013974850815006503, "loss": 0.2609, "step": 987 }, { "epoch": 1.3, "grad_norm": 0.07972935132468843, "learning_rate": 0.0001396079766039157, "loss": 0.2593, "step": 988 }, { "epoch": 1.3013157894736842, "grad_norm": 0.07852889738678417, "learning_rate": 0.0001394673522193755, "loss": 0.2651, "step": 989 }, { "epoch": 1.3026315789473684, "grad_norm": 0.08131207293814138, "learning_rate": 0.0001393266353260583, "loss": 0.2636, "step": 990 }, { "epoch": 1.3039473684210527, "grad_norm": 0.08085167622175171, "learning_rate": 0.00013918582625379501, "loss": 0.2581, "step": 991 }, { "epoch": 1.305263157894737, "grad_norm": 0.08310768520116221, "learning_rate": 0.00013904492533263244, "loss": 0.2804, "step": 992 }, { "epoch": 1.306578947368421, "grad_norm": 0.0835988231304127, "learning_rate": 0.0001389039328928326, "loss": 0.2764, "step": 993 }, { "epoch": 1.3078947368421052, "grad_norm": 0.08043076516824947, "learning_rate": 0.0001387628492648723, "loss": 0.2643, "step": 994 }, { "epoch": 1.3092105263157894, "grad_norm": 0.08124742205143656, "learning_rate": 0.00013862167477944184, "loss": 0.2779, "step": 995 }, { "epoch": 1.3105263157894738, "grad_norm": 0.07855989514944883, "learning_rate": 0.00013848040976744457, "loss": 0.2641, "step": 996 }, { "epoch": 1.311842105263158, "grad_norm": 0.08038296983596055, "learning_rate": 0.00013833905455999603, "loss": 0.2611, "step": 997 }, { "epoch": 1.313157894736842, "grad_norm": 0.07754716363033522, "learning_rate": 0.0001381976094884232, "loss": 0.2565, "step": 998 }, { "epoch": 1.3144736842105262, "grad_norm": 0.0771054502214774, "learning_rate": 0.00013805607488426362, "loss": 0.2667, "step": 999 }, { "epoch": 1.3157894736842106, "grad_norm": 0.08092253741118172, "learning_rate": 0.00013791445107926478, "loss": 0.2715, "step": 1000 }, { "epoch": 1.3171052631578948, "grad_norm": 0.08392681822232567, "learning_rate": 0.00013777273840538312, "loss": 0.2696, "step": 1001 }, { "epoch": 1.318421052631579, "grad_norm": 0.08088919172679297, "learning_rate": 0.00013763093719478358, "loss": 0.262, "step": 1002 }, { "epoch": 1.319736842105263, "grad_norm": 0.07966639118780608, "learning_rate": 0.00013748904777983838, "loss": 0.2632, "step": 1003 }, { "epoch": 1.3210526315789473, "grad_norm": 0.07949592188633048, "learning_rate": 0.00013734707049312673, "loss": 0.2513, "step": 1004 }, { "epoch": 1.3223684210526316, "grad_norm": 0.078682262346058, "learning_rate": 0.00013720500566743362, "loss": 0.2567, "step": 1005 }, { "epoch": 1.3236842105263158, "grad_norm": 0.07844946772992165, "learning_rate": 0.00013706285363574932, "loss": 0.2743, "step": 1006 }, { "epoch": 1.325, "grad_norm": 0.08305320278558118, "learning_rate": 0.00013692061473126845, "loss": 0.2623, "step": 1007 }, { "epoch": 1.3263157894736843, "grad_norm": 0.08139900273296097, "learning_rate": 0.00013677828928738934, "loss": 0.269, "step": 1008 }, { "epoch": 1.3276315789473685, "grad_norm": 0.0839606346260752, "learning_rate": 0.00013663587763771304, "loss": 0.2671, "step": 1009 }, { "epoch": 1.3289473684210527, "grad_norm": 0.08150284952029307, "learning_rate": 0.0001364933801160428, "loss": 0.2568, "step": 1010 }, { "epoch": 1.3302631578947368, "grad_norm": 0.07748260538941765, "learning_rate": 0.00013635079705638298, "loss": 0.2423, "step": 1011 }, { "epoch": 1.331578947368421, "grad_norm": 0.07721123482181354, "learning_rate": 0.00013620812879293863, "loss": 0.2621, "step": 1012 }, { "epoch": 1.3328947368421051, "grad_norm": 0.08010424676918745, "learning_rate": 0.00013606537566011434, "loss": 0.2643, "step": 1013 }, { "epoch": 1.3342105263157895, "grad_norm": 0.07968708879421524, "learning_rate": 0.00013592253799251376, "loss": 0.2677, "step": 1014 }, { "epoch": 1.3355263157894737, "grad_norm": 0.07865457628121457, "learning_rate": 0.00013577961612493852, "loss": 0.2646, "step": 1015 }, { "epoch": 1.3368421052631578, "grad_norm": 0.07821958051935139, "learning_rate": 0.00013563661039238785, "loss": 0.2605, "step": 1016 }, { "epoch": 1.3381578947368422, "grad_norm": 0.07970724490030985, "learning_rate": 0.00013549352113005728, "loss": 0.2591, "step": 1017 }, { "epoch": 1.3394736842105264, "grad_norm": 0.08021459794844776, "learning_rate": 0.00013535034867333837, "loss": 0.2684, "step": 1018 }, { "epoch": 1.3407894736842105, "grad_norm": 0.08161684018773531, "learning_rate": 0.00013520709335781752, "loss": 0.2579, "step": 1019 }, { "epoch": 1.3421052631578947, "grad_norm": 0.0780970363121174, "learning_rate": 0.00013506375551927547, "loss": 0.258, "step": 1020 }, { "epoch": 1.3434210526315788, "grad_norm": 0.08023973869702199, "learning_rate": 0.00013492033549368618, "loss": 0.2758, "step": 1021 }, { "epoch": 1.3447368421052632, "grad_norm": 0.081261194750605, "learning_rate": 0.00013477683361721657, "loss": 0.2579, "step": 1022 }, { "epoch": 1.3460526315789474, "grad_norm": 0.07810232344712505, "learning_rate": 0.00013463325022622507, "loss": 0.2597, "step": 1023 }, { "epoch": 1.3473684210526315, "grad_norm": 0.07656812032411678, "learning_rate": 0.00013448958565726144, "loss": 0.2497, "step": 1024 }, { "epoch": 1.3486842105263157, "grad_norm": 0.07962640765722634, "learning_rate": 0.00013434584024706554, "loss": 0.2703, "step": 1025 }, { "epoch": 1.35, "grad_norm": 0.07859791053686446, "learning_rate": 0.00013420201433256689, "loss": 0.2643, "step": 1026 }, { "epoch": 1.3513157894736842, "grad_norm": 0.0785017433196338, "learning_rate": 0.00013405810825088351, "loss": 0.2554, "step": 1027 }, { "epoch": 1.3526315789473684, "grad_norm": 0.07927097278835697, "learning_rate": 0.00013391412233932149, "loss": 0.2645, "step": 1028 }, { "epoch": 1.3539473684210526, "grad_norm": 0.08028070891011724, "learning_rate": 0.0001337700569353739, "loss": 0.259, "step": 1029 }, { "epoch": 1.3552631578947367, "grad_norm": 0.07903933780122868, "learning_rate": 0.0001336259123767203, "loss": 0.2687, "step": 1030 }, { "epoch": 1.356578947368421, "grad_norm": 0.08789904311515143, "learning_rate": 0.00013348168900122559, "loss": 0.2653, "step": 1031 }, { "epoch": 1.3578947368421053, "grad_norm": 0.0762050013831853, "learning_rate": 0.00013333738714693956, "loss": 0.2526, "step": 1032 }, { "epoch": 1.3592105263157894, "grad_norm": 0.08083459753449757, "learning_rate": 0.00013319300715209587, "loss": 0.2687, "step": 1033 }, { "epoch": 1.3605263157894738, "grad_norm": 0.0771191494993079, "learning_rate": 0.00013304854935511143, "loss": 0.2659, "step": 1034 }, { "epoch": 1.361842105263158, "grad_norm": 0.0810612005537591, "learning_rate": 0.00013290401409458532, "loss": 0.2587, "step": 1035 }, { "epoch": 1.3631578947368421, "grad_norm": 0.08077515208209074, "learning_rate": 0.00013275940170929843, "loss": 0.2763, "step": 1036 }, { "epoch": 1.3644736842105263, "grad_norm": 0.0795071805895556, "learning_rate": 0.00013261471253821227, "loss": 0.2803, "step": 1037 }, { "epoch": 1.3657894736842104, "grad_norm": 0.0783867946653792, "learning_rate": 0.00013246994692046836, "loss": 0.2659, "step": 1038 }, { "epoch": 1.3671052631578946, "grad_norm": 0.07705812242711711, "learning_rate": 0.0001323251051953874, "loss": 0.2626, "step": 1039 }, { "epoch": 1.368421052631579, "grad_norm": 0.07981528649898714, "learning_rate": 0.00013218018770246858, "loss": 0.2652, "step": 1040 }, { "epoch": 1.3697368421052631, "grad_norm": 0.07580835271521758, "learning_rate": 0.00013203519478138852, "loss": 0.2593, "step": 1041 }, { "epoch": 1.3710526315789473, "grad_norm": 0.0786805131510188, "learning_rate": 0.00013189012677200073, "loss": 0.2583, "step": 1042 }, { "epoch": 1.3723684210526317, "grad_norm": 0.07872461104976057, "learning_rate": 0.00013174498401433474, "loss": 0.2542, "step": 1043 }, { "epoch": 1.3736842105263158, "grad_norm": 0.08393125286330598, "learning_rate": 0.00013159976684859527, "loss": 0.2717, "step": 1044 }, { "epoch": 1.375, "grad_norm": 0.08038226441935513, "learning_rate": 0.00013145447561516138, "loss": 0.2538, "step": 1045 }, { "epoch": 1.3763157894736842, "grad_norm": 0.08104956663295539, "learning_rate": 0.00013130911065458584, "loss": 0.2666, "step": 1046 }, { "epoch": 1.3776315789473683, "grad_norm": 0.08149965352071284, "learning_rate": 0.00013116367230759415, "loss": 0.2741, "step": 1047 }, { "epoch": 1.3789473684210527, "grad_norm": 0.0800773120109665, "learning_rate": 0.00013101816091508388, "loss": 0.2621, "step": 1048 }, { "epoch": 1.3802631578947369, "grad_norm": 0.07883317602840334, "learning_rate": 0.00013087257681812376, "loss": 0.2627, "step": 1049 }, { "epoch": 1.381578947368421, "grad_norm": 0.07808540918916572, "learning_rate": 0.00013072692035795305, "loss": 0.2661, "step": 1050 }, { "epoch": 1.3828947368421054, "grad_norm": 0.07784582555862558, "learning_rate": 0.00013058119187598047, "loss": 0.2605, "step": 1051 }, { "epoch": 1.3842105263157896, "grad_norm": 0.0789115552020285, "learning_rate": 0.0001304353917137836, "loss": 0.2591, "step": 1052 }, { "epoch": 1.3855263157894737, "grad_norm": 0.07707043429238686, "learning_rate": 0.00013028952021310812, "loss": 0.2602, "step": 1053 }, { "epoch": 1.3868421052631579, "grad_norm": 0.0764917703073864, "learning_rate": 0.00013014357771586686, "loss": 0.252, "step": 1054 }, { "epoch": 1.388157894736842, "grad_norm": 0.08152267179709696, "learning_rate": 0.000129997564564139, "loss": 0.2761, "step": 1055 }, { "epoch": 1.3894736842105262, "grad_norm": 0.07488321865899004, "learning_rate": 0.00012985148110016947, "loss": 0.2557, "step": 1056 }, { "epoch": 1.3907894736842106, "grad_norm": 0.08223478003170398, "learning_rate": 0.00012970532766636787, "loss": 0.2758, "step": 1057 }, { "epoch": 1.3921052631578947, "grad_norm": 0.08146309512224036, "learning_rate": 0.00012955910460530788, "loss": 0.2631, "step": 1058 }, { "epoch": 1.393421052631579, "grad_norm": 0.08083129097727351, "learning_rate": 0.00012941281225972636, "loss": 0.2848, "step": 1059 }, { "epoch": 1.3947368421052633, "grad_norm": 0.07757651778516991, "learning_rate": 0.0001292664509725226, "loss": 0.2524, "step": 1060 }, { "epoch": 1.3960526315789474, "grad_norm": 0.0763538974448064, "learning_rate": 0.0001291200210867574, "loss": 0.257, "step": 1061 }, { "epoch": 1.3973684210526316, "grad_norm": 0.07882164295488549, "learning_rate": 0.0001289735229456525, "loss": 0.2687, "step": 1062 }, { "epoch": 1.3986842105263158, "grad_norm": 0.07892589978626854, "learning_rate": 0.0001288269568925894, "loss": 0.2704, "step": 1063 }, { "epoch": 1.4, "grad_norm": 0.07820774930633734, "learning_rate": 0.00012868032327110904, "loss": 0.2619, "step": 1064 }, { "epoch": 1.4013157894736843, "grad_norm": 0.07815937274547557, "learning_rate": 0.00012853362242491053, "loss": 0.2717, "step": 1065 }, { "epoch": 1.4026315789473685, "grad_norm": 0.07522447260364416, "learning_rate": 0.0001283868546978507, "loss": 0.2554, "step": 1066 }, { "epoch": 1.4039473684210526, "grad_norm": 0.08070192461804411, "learning_rate": 0.00012824002043394298, "loss": 0.2754, "step": 1067 }, { "epoch": 1.4052631578947368, "grad_norm": 0.0803973317378383, "learning_rate": 0.00012809311997735696, "loss": 0.2766, "step": 1068 }, { "epoch": 1.4065789473684212, "grad_norm": 0.07866656181841848, "learning_rate": 0.00012794615367241717, "loss": 0.2668, "step": 1069 }, { "epoch": 1.4078947368421053, "grad_norm": 0.0794268932038752, "learning_rate": 0.00012779912186360268, "loss": 0.2675, "step": 1070 }, { "epoch": 1.4092105263157895, "grad_norm": 0.07900705819829199, "learning_rate": 0.0001276520248955459, "loss": 0.2635, "step": 1071 }, { "epoch": 1.4105263157894736, "grad_norm": 0.07918211928299614, "learning_rate": 0.00012750486311303218, "loss": 0.262, "step": 1072 }, { "epoch": 1.4118421052631578, "grad_norm": 0.0819965398370594, "learning_rate": 0.00012735763686099862, "loss": 0.2602, "step": 1073 }, { "epoch": 1.4131578947368422, "grad_norm": 0.07831915554583863, "learning_rate": 0.00012721034648453353, "loss": 0.2628, "step": 1074 }, { "epoch": 1.4144736842105263, "grad_norm": 0.07786089842983143, "learning_rate": 0.00012706299232887543, "loss": 0.2561, "step": 1075 }, { "epoch": 1.4157894736842105, "grad_norm": 0.07971967408570595, "learning_rate": 0.00012691557473941243, "loss": 0.2656, "step": 1076 }, { "epoch": 1.4171052631578949, "grad_norm": 0.07801118181055779, "learning_rate": 0.00012676809406168133, "loss": 0.2652, "step": 1077 }, { "epoch": 1.418421052631579, "grad_norm": 0.08515813484361862, "learning_rate": 0.00012662055064136668, "loss": 0.2753, "step": 1078 }, { "epoch": 1.4197368421052632, "grad_norm": 0.08068756136226636, "learning_rate": 0.00012647294482430024, "loss": 0.2634, "step": 1079 }, { "epoch": 1.4210526315789473, "grad_norm": 0.0794228149168458, "learning_rate": 0.00012632527695645993, "loss": 0.2613, "step": 1080 }, { "epoch": 1.4223684210526315, "grad_norm": 0.07796131697715634, "learning_rate": 0.00012617754738396915, "loss": 0.2665, "step": 1081 }, { "epoch": 1.4236842105263157, "grad_norm": 0.07971630587719376, "learning_rate": 0.00012602975645309593, "loss": 0.2631, "step": 1082 }, { "epoch": 1.425, "grad_norm": 0.07712944683367323, "learning_rate": 0.00012588190451025207, "loss": 0.2509, "step": 1083 }, { "epoch": 1.4263157894736842, "grad_norm": 0.08097034232881713, "learning_rate": 0.0001257339919019925, "loss": 0.2649, "step": 1084 }, { "epoch": 1.4276315789473684, "grad_norm": 0.07618596280882184, "learning_rate": 0.0001255860189750142, "loss": 0.2613, "step": 1085 }, { "epoch": 1.4289473684210527, "grad_norm": 0.08305767349447553, "learning_rate": 0.00012543798607615565, "loss": 0.2758, "step": 1086 }, { "epoch": 1.430263157894737, "grad_norm": 0.08146614433553916, "learning_rate": 0.0001252898935523958, "loss": 0.2819, "step": 1087 }, { "epoch": 1.431578947368421, "grad_norm": 0.07805110779507855, "learning_rate": 0.00012514174175085345, "loss": 0.2649, "step": 1088 }, { "epoch": 1.4328947368421052, "grad_norm": 0.07969018389253195, "learning_rate": 0.0001249935310187863, "loss": 0.2637, "step": 1089 }, { "epoch": 1.4342105263157894, "grad_norm": 0.07956536699437335, "learning_rate": 0.00012484526170359012, "loss": 0.2651, "step": 1090 }, { "epoch": 1.4355263157894738, "grad_norm": 0.07836456115097726, "learning_rate": 0.0001246969341527981, "loss": 0.2645, "step": 1091 }, { "epoch": 1.436842105263158, "grad_norm": 0.07976260454428451, "learning_rate": 0.00012454854871407994, "loss": 0.2824, "step": 1092 }, { "epoch": 1.438157894736842, "grad_norm": 0.07726676671750074, "learning_rate": 0.00012440010573524086, "loss": 0.2632, "step": 1093 }, { "epoch": 1.4394736842105262, "grad_norm": 0.07821021819439651, "learning_rate": 0.00012425160556422114, "loss": 0.2731, "step": 1094 }, { "epoch": 1.4407894736842106, "grad_norm": 0.08013778051504493, "learning_rate": 0.00012410304854909495, "loss": 0.2559, "step": 1095 }, { "epoch": 1.4421052631578948, "grad_norm": 0.08401500576489536, "learning_rate": 0.0001239544350380699, "loss": 0.2719, "step": 1096 }, { "epoch": 1.443421052631579, "grad_norm": 0.08434562563421157, "learning_rate": 0.0001238057653794858, "loss": 0.269, "step": 1097 }, { "epoch": 1.444736842105263, "grad_norm": 0.07770950324688412, "learning_rate": 0.00012365703992181425, "loss": 0.2623, "step": 1098 }, { "epoch": 1.4460526315789473, "grad_norm": 0.08224554301801708, "learning_rate": 0.0001235082590136575, "loss": 0.2808, "step": 1099 }, { "epoch": 1.4473684210526316, "grad_norm": 0.07921911039298107, "learning_rate": 0.00012335942300374788, "loss": 0.2598, "step": 1100 }, { "epoch": 1.4486842105263158, "grad_norm": 0.08061258248097088, "learning_rate": 0.0001232105322409468, "loss": 0.2674, "step": 1101 }, { "epoch": 1.45, "grad_norm": 0.0777388780233582, "learning_rate": 0.00012306158707424403, "loss": 0.2498, "step": 1102 }, { "epoch": 1.4513157894736843, "grad_norm": 0.07794347014147819, "learning_rate": 0.00012291258785275687, "loss": 0.2569, "step": 1103 }, { "epoch": 1.4526315789473685, "grad_norm": 0.08178650020126337, "learning_rate": 0.00012276353492572935, "loss": 0.2798, "step": 1104 }, { "epoch": 1.4539473684210527, "grad_norm": 0.08037719214824916, "learning_rate": 0.0001226144286425313, "loss": 0.2684, "step": 1105 }, { "epoch": 1.4552631578947368, "grad_norm": 0.0763595502941657, "learning_rate": 0.00012246526935265768, "loss": 0.2483, "step": 1106 }, { "epoch": 1.456578947368421, "grad_norm": 0.07688959299684457, "learning_rate": 0.00012231605740572766, "loss": 0.2524, "step": 1107 }, { "epoch": 1.4578947368421051, "grad_norm": 0.07894151742409149, "learning_rate": 0.00012216679315148386, "loss": 0.2642, "step": 1108 }, { "epoch": 1.4592105263157895, "grad_norm": 0.07777205049498395, "learning_rate": 0.00012201747693979151, "loss": 0.2499, "step": 1109 }, { "epoch": 1.4605263157894737, "grad_norm": 0.08207208250132972, "learning_rate": 0.0001218681091206376, "loss": 0.2751, "step": 1110 }, { "epoch": 1.4618421052631578, "grad_norm": 0.07844860347132009, "learning_rate": 0.00012171869004413006, "loss": 0.2602, "step": 1111 }, { "epoch": 1.4631578947368422, "grad_norm": 0.07856698522588018, "learning_rate": 0.00012156922006049702, "loss": 0.2658, "step": 1112 }, { "epoch": 1.4644736842105264, "grad_norm": 0.07684231974080626, "learning_rate": 0.00012141969952008591, "loss": 0.2519, "step": 1113 }, { "epoch": 1.4657894736842105, "grad_norm": 0.07956756231471081, "learning_rate": 0.00012127012877336266, "loss": 0.2587, "step": 1114 }, { "epoch": 1.4671052631578947, "grad_norm": 0.07797778646499881, "learning_rate": 0.00012112050817091087, "loss": 0.2435, "step": 1115 }, { "epoch": 1.4684210526315788, "grad_norm": 0.07789957155018702, "learning_rate": 0.00012097083806343103, "loss": 0.2675, "step": 1116 }, { "epoch": 1.4697368421052632, "grad_norm": 0.08090700227859492, "learning_rate": 0.00012082111880173965, "loss": 0.2772, "step": 1117 }, { "epoch": 1.4710526315789474, "grad_norm": 0.07661313410176111, "learning_rate": 0.0001206713507367684, "loss": 0.2571, "step": 1118 }, { "epoch": 1.4723684210526315, "grad_norm": 0.07907971896935535, "learning_rate": 0.00012052153421956342, "loss": 0.2649, "step": 1119 }, { "epoch": 1.4736842105263157, "grad_norm": 0.07996349059046486, "learning_rate": 0.00012037166960128443, "loss": 0.2536, "step": 1120 }, { "epoch": 1.475, "grad_norm": 0.08235866500979575, "learning_rate": 0.00012022175723320381, "loss": 0.2778, "step": 1121 }, { "epoch": 1.4763157894736842, "grad_norm": 0.08262772185376702, "learning_rate": 0.00012007179746670592, "loss": 0.2717, "step": 1122 }, { "epoch": 1.4776315789473684, "grad_norm": 0.07931030156939642, "learning_rate": 0.00011992179065328618, "loss": 0.2556, "step": 1123 }, { "epoch": 1.4789473684210526, "grad_norm": 0.0818492117725006, "learning_rate": 0.00011977173714455034, "loss": 0.2734, "step": 1124 }, { "epoch": 1.4802631578947367, "grad_norm": 0.08007910950748007, "learning_rate": 0.0001196216372922136, "loss": 0.2591, "step": 1125 }, { "epoch": 1.481578947368421, "grad_norm": 0.07912376020739983, "learning_rate": 0.00011947149144809969, "loss": 0.2573, "step": 1126 }, { "epoch": 1.4828947368421053, "grad_norm": 0.07919295195700186, "learning_rate": 0.00011932129996414023, "loss": 0.2796, "step": 1127 }, { "epoch": 1.4842105263157894, "grad_norm": 0.0768053198795628, "learning_rate": 0.00011917106319237386, "loss": 0.258, "step": 1128 }, { "epoch": 1.4855263157894738, "grad_norm": 0.0751828643806364, "learning_rate": 0.00011902078148494525, "loss": 0.2482, "step": 1129 }, { "epoch": 1.486842105263158, "grad_norm": 0.07573342946120709, "learning_rate": 0.00011887045519410442, "loss": 0.2515, "step": 1130 }, { "epoch": 1.4881578947368421, "grad_norm": 0.08033169720613252, "learning_rate": 0.00011872008467220599, "loss": 0.2727, "step": 1131 }, { "epoch": 1.4894736842105263, "grad_norm": 0.07910729035555242, "learning_rate": 0.00011856967027170818, "loss": 0.2753, "step": 1132 }, { "epoch": 1.4907894736842104, "grad_norm": 0.07820164568049928, "learning_rate": 0.00011841921234517206, "loss": 0.2612, "step": 1133 }, { "epoch": 1.4921052631578946, "grad_norm": 0.07934456848043618, "learning_rate": 0.00011826871124526071, "loss": 0.2619, "step": 1134 }, { "epoch": 1.493421052631579, "grad_norm": 0.08088874374294254, "learning_rate": 0.00011811816732473841, "loss": 0.2582, "step": 1135 }, { "epoch": 1.4947368421052631, "grad_norm": 0.07877981008540878, "learning_rate": 0.00011796758093646989, "loss": 0.2548, "step": 1136 }, { "epoch": 1.4960526315789473, "grad_norm": 0.07864927828591978, "learning_rate": 0.00011781695243341932, "loss": 0.2587, "step": 1137 }, { "epoch": 1.4973684210526317, "grad_norm": 0.08496699143061613, "learning_rate": 0.0001176662821686496, "loss": 0.2769, "step": 1138 }, { "epoch": 1.4986842105263158, "grad_norm": 0.07790722036110928, "learning_rate": 0.00011751557049532153, "loss": 0.2625, "step": 1139 }, { "epoch": 1.5, "grad_norm": 0.07732825022547576, "learning_rate": 0.00011736481776669306, "loss": 0.2595, "step": 1140 }, { "epoch": 1.5013157894736842, "grad_norm": 0.07839345817113182, "learning_rate": 0.00011721402433611818, "loss": 0.264, "step": 1141 }, { "epoch": 1.5026315789473683, "grad_norm": 0.0797985656438268, "learning_rate": 0.00011706319055704642, "loss": 0.2762, "step": 1142 }, { "epoch": 1.5039473684210525, "grad_norm": 0.07859356519091114, "learning_rate": 0.00011691231678302187, "loss": 0.2589, "step": 1143 }, { "epoch": 1.5052631578947369, "grad_norm": 0.08132159915933083, "learning_rate": 0.00011676140336768236, "loss": 0.2676, "step": 1144 }, { "epoch": 1.506578947368421, "grad_norm": 0.07821055809299678, "learning_rate": 0.0001166104506647586, "loss": 0.2647, "step": 1145 }, { "epoch": 1.5078947368421054, "grad_norm": 0.07762701396147464, "learning_rate": 0.00011645945902807341, "loss": 0.2646, "step": 1146 }, { "epoch": 1.5092105263157896, "grad_norm": 0.07496175871914876, "learning_rate": 0.00011630842881154085, "loss": 0.2576, "step": 1147 }, { "epoch": 1.5105263157894737, "grad_norm": 0.0784666282944514, "learning_rate": 0.00011615736036916549, "loss": 0.2679, "step": 1148 }, { "epoch": 1.5118421052631579, "grad_norm": 0.07722067192546046, "learning_rate": 0.0001160062540550414, "loss": 0.2639, "step": 1149 }, { "epoch": 1.513157894736842, "grad_norm": 0.08059485240498777, "learning_rate": 0.00011585511022335142, "loss": 0.2764, "step": 1150 }, { "epoch": 1.5144736842105262, "grad_norm": 0.08129025875363483, "learning_rate": 0.00011570392922836644, "loss": 0.2714, "step": 1151 }, { "epoch": 1.5157894736842106, "grad_norm": 0.07486275958049285, "learning_rate": 0.00011555271142444433, "loss": 0.2517, "step": 1152 }, { "epoch": 1.5171052631578947, "grad_norm": 0.08143946348918624, "learning_rate": 0.0001154014571660293, "loss": 0.2747, "step": 1153 }, { "epoch": 1.518421052631579, "grad_norm": 0.07923671682954142, "learning_rate": 0.00011525016680765102, "loss": 0.2485, "step": 1154 }, { "epoch": 1.5197368421052633, "grad_norm": 0.08011623724583099, "learning_rate": 0.00011509884070392369, "loss": 0.2656, "step": 1155 }, { "epoch": 1.5210526315789474, "grad_norm": 0.07946415372441243, "learning_rate": 0.00011494747920954545, "loss": 0.2572, "step": 1156 }, { "epoch": 1.5223684210526316, "grad_norm": 0.0812375151138501, "learning_rate": 0.00011479608267929722, "loss": 0.2697, "step": 1157 }, { "epoch": 1.5236842105263158, "grad_norm": 0.07578132261810436, "learning_rate": 0.00011464465146804217, "loss": 0.2411, "step": 1158 }, { "epoch": 1.525, "grad_norm": 0.07784132005694626, "learning_rate": 0.00011449318593072466, "loss": 0.2678, "step": 1159 }, { "epoch": 1.526315789473684, "grad_norm": 0.07909385203173394, "learning_rate": 0.00011434168642236964, "loss": 0.2754, "step": 1160 }, { "epoch": 1.5276315789473685, "grad_norm": 0.0776880493894959, "learning_rate": 0.00011419015329808157, "loss": 0.2655, "step": 1161 }, { "epoch": 1.5289473684210526, "grad_norm": 0.07765848299877728, "learning_rate": 0.00011403858691304373, "loss": 0.2607, "step": 1162 }, { "epoch": 1.530263157894737, "grad_norm": 0.07646804543626935, "learning_rate": 0.00011388698762251732, "loss": 0.2596, "step": 1163 }, { "epoch": 1.5315789473684212, "grad_norm": 0.07740933938489439, "learning_rate": 0.00011373535578184082, "loss": 0.2707, "step": 1164 }, { "epoch": 1.5328947368421053, "grad_norm": 0.07996711741187337, "learning_rate": 0.00011358369174642887, "loss": 0.2664, "step": 1165 }, { "epoch": 1.5342105263157895, "grad_norm": 0.07987614614098584, "learning_rate": 0.00011343199587177155, "loss": 0.2714, "step": 1166 }, { "epoch": 1.5355263157894736, "grad_norm": 0.07887783815136196, "learning_rate": 0.00011328026851343367, "loss": 0.2548, "step": 1167 }, { "epoch": 1.5368421052631578, "grad_norm": 0.07775100523637381, "learning_rate": 0.00011312851002705383, "loss": 0.2587, "step": 1168 }, { "epoch": 1.538157894736842, "grad_norm": 0.08179602367804785, "learning_rate": 0.00011297672076834348, "loss": 0.2743, "step": 1169 }, { "epoch": 1.5394736842105263, "grad_norm": 0.08166670313368278, "learning_rate": 0.00011282490109308633, "loss": 0.2743, "step": 1170 }, { "epoch": 1.5407894736842105, "grad_norm": 0.07926586858824067, "learning_rate": 0.00011267305135713726, "loss": 0.2532, "step": 1171 }, { "epoch": 1.5421052631578949, "grad_norm": 0.07935859462462387, "learning_rate": 0.00011252117191642175, "loss": 0.2586, "step": 1172 }, { "epoch": 1.543421052631579, "grad_norm": 0.07771700777076371, "learning_rate": 0.00011236926312693479, "loss": 0.2556, "step": 1173 }, { "epoch": 1.5447368421052632, "grad_norm": 0.07944739071127381, "learning_rate": 0.00011221732534474019, "loss": 0.2669, "step": 1174 }, { "epoch": 1.5460526315789473, "grad_norm": 0.07926172086790113, "learning_rate": 0.00011206535892596975, "loss": 0.2578, "step": 1175 }, { "epoch": 1.5473684210526315, "grad_norm": 0.07836482244209485, "learning_rate": 0.00011191336422682237, "loss": 0.266, "step": 1176 }, { "epoch": 1.5486842105263157, "grad_norm": 0.08060052790786207, "learning_rate": 0.00011176134160356327, "loss": 0.27, "step": 1177 }, { "epoch": 1.55, "grad_norm": 0.07857242666783919, "learning_rate": 0.00011160929141252303, "loss": 0.2561, "step": 1178 }, { "epoch": 1.5513157894736842, "grad_norm": 0.0762732065222906, "learning_rate": 0.00011145721401009694, "loss": 0.265, "step": 1179 }, { "epoch": 1.5526315789473686, "grad_norm": 0.07588799278326497, "learning_rate": 0.00011130510975274409, "loss": 0.2659, "step": 1180 }, { "epoch": 1.5539473684210527, "grad_norm": 0.0782353560349947, "learning_rate": 0.00011115297899698638, "loss": 0.2622, "step": 1181 }, { "epoch": 1.555263157894737, "grad_norm": 0.0779137347020748, "learning_rate": 0.00011100082209940795, "loss": 0.2581, "step": 1182 }, { "epoch": 1.556578947368421, "grad_norm": 0.07626200398110206, "learning_rate": 0.00011084863941665415, "loss": 0.2525, "step": 1183 }, { "epoch": 1.5578947368421052, "grad_norm": 0.08075601397948919, "learning_rate": 0.00011069643130543084, "loss": 0.2589, "step": 1184 }, { "epoch": 1.5592105263157894, "grad_norm": 0.07890719633469666, "learning_rate": 0.00011054419812250338, "loss": 0.2609, "step": 1185 }, { "epoch": 1.5605263157894735, "grad_norm": 0.0789865706647385, "learning_rate": 0.00011039194022469597, "loss": 0.2515, "step": 1186 }, { "epoch": 1.561842105263158, "grad_norm": 0.0796846545034331, "learning_rate": 0.0001102396579688907, "loss": 0.2687, "step": 1187 }, { "epoch": 1.563157894736842, "grad_norm": 0.07938699909977037, "learning_rate": 0.00011008735171202684, "loss": 0.2636, "step": 1188 }, { "epoch": 1.5644736842105265, "grad_norm": 0.0810811457705607, "learning_rate": 0.00010993502181109978, "loss": 0.2544, "step": 1189 }, { "epoch": 1.5657894736842106, "grad_norm": 0.0811343931888347, "learning_rate": 0.0001097826686231604, "loss": 0.274, "step": 1190 }, { "epoch": 1.5671052631578948, "grad_norm": 0.0786799724878539, "learning_rate": 0.00010963029250531418, "loss": 0.2619, "step": 1191 }, { "epoch": 1.568421052631579, "grad_norm": 0.07801305259558834, "learning_rate": 0.00010947789381472035, "loss": 0.2575, "step": 1192 }, { "epoch": 1.569736842105263, "grad_norm": 0.08019971785855351, "learning_rate": 0.00010932547290859103, "loss": 0.2592, "step": 1193 }, { "epoch": 1.5710526315789473, "grad_norm": 0.07693908653004418, "learning_rate": 0.00010917303014419036, "loss": 0.2665, "step": 1194 }, { "epoch": 1.5723684210526314, "grad_norm": 0.07646993423396128, "learning_rate": 0.00010902056587883378, "loss": 0.245, "step": 1195 }, { "epoch": 1.5736842105263158, "grad_norm": 0.080428348690664, "learning_rate": 0.00010886808046988717, "loss": 0.2588, "step": 1196 }, { "epoch": 1.575, "grad_norm": 0.07877667825439123, "learning_rate": 0.00010871557427476583, "loss": 0.2688, "step": 1197 }, { "epoch": 1.5763157894736843, "grad_norm": 0.07769524509043618, "learning_rate": 0.0001085630476509339, "loss": 0.2588, "step": 1198 }, { "epoch": 1.5776315789473685, "grad_norm": 0.08089328257237792, "learning_rate": 0.00010841050095590335, "loss": 0.2643, "step": 1199 }, { "epoch": 1.5789473684210527, "grad_norm": 0.07938294687717415, "learning_rate": 0.00010825793454723325, "loss": 0.2678, "step": 1200 }, { "epoch": 1.5802631578947368, "grad_norm": 0.07853030205890081, "learning_rate": 0.0001081053487825288, "loss": 0.2606, "step": 1201 }, { "epoch": 1.581578947368421, "grad_norm": 0.07774175526579997, "learning_rate": 0.00010795274401944058, "loss": 0.2582, "step": 1202 }, { "epoch": 1.5828947368421051, "grad_norm": 0.07578431858186546, "learning_rate": 0.00010780012061566378, "loss": 0.2568, "step": 1203 }, { "epoch": 1.5842105263157895, "grad_norm": 0.07606000180404633, "learning_rate": 0.00010764747892893723, "loss": 0.2646, "step": 1204 }, { "epoch": 1.5855263157894737, "grad_norm": 0.07591132081458318, "learning_rate": 0.0001074948193170426, "loss": 0.2517, "step": 1205 }, { "epoch": 1.586842105263158, "grad_norm": 0.0781464844924248, "learning_rate": 0.00010734214213780354, "loss": 0.2491, "step": 1206 }, { "epoch": 1.5881578947368422, "grad_norm": 0.07902701850224841, "learning_rate": 0.000107189447749085, "loss": 0.2653, "step": 1207 }, { "epoch": 1.5894736842105264, "grad_norm": 0.08084483037179403, "learning_rate": 0.00010703673650879218, "loss": 0.2625, "step": 1208 }, { "epoch": 1.5907894736842105, "grad_norm": 0.0773678893977578, "learning_rate": 0.00010688400877486978, "loss": 0.2563, "step": 1209 }, { "epoch": 1.5921052631578947, "grad_norm": 0.07876897624596318, "learning_rate": 0.00010673126490530112, "loss": 0.2649, "step": 1210 }, { "epoch": 1.5934210526315788, "grad_norm": 0.0804198373742344, "learning_rate": 0.00010657850525810748, "loss": 0.268, "step": 1211 }, { "epoch": 1.594736842105263, "grad_norm": 0.08109356740988712, "learning_rate": 0.00010642573019134703, "loss": 0.2634, "step": 1212 }, { "epoch": 1.5960526315789474, "grad_norm": 0.08129604963653887, "learning_rate": 0.00010627294006311404, "loss": 0.2852, "step": 1213 }, { "epoch": 1.5973684210526315, "grad_norm": 0.07643522766740891, "learning_rate": 0.00010612013523153812, "loss": 0.261, "step": 1214 }, { "epoch": 1.598684210526316, "grad_norm": 0.07695348511929585, "learning_rate": 0.0001059673160547834, "loss": 0.2651, "step": 1215 }, { "epoch": 1.6, "grad_norm": 0.07630900188908298, "learning_rate": 0.00010581448289104758, "loss": 0.2633, "step": 1216 }, { "epoch": 1.6013157894736842, "grad_norm": 0.07437930134164347, "learning_rate": 0.00010566163609856117, "loss": 0.2575, "step": 1217 }, { "epoch": 1.6026315789473684, "grad_norm": 0.0780918664132209, "learning_rate": 0.00010550877603558655, "loss": 0.2647, "step": 1218 }, { "epoch": 1.6039473684210526, "grad_norm": 0.07895904000053701, "learning_rate": 0.00010535590306041732, "loss": 0.2657, "step": 1219 }, { "epoch": 1.6052631578947367, "grad_norm": 0.0782823182484623, "learning_rate": 0.00010520301753137724, "loss": 0.2539, "step": 1220 }, { "epoch": 1.606578947368421, "grad_norm": 0.07836129489018223, "learning_rate": 0.00010505011980681962, "loss": 0.2623, "step": 1221 }, { "epoch": 1.6078947368421053, "grad_norm": 0.07921294679317484, "learning_rate": 0.00010489721024512618, "loss": 0.2574, "step": 1222 }, { "epoch": 1.6092105263157894, "grad_norm": 0.08253466147497233, "learning_rate": 0.00010474428920470654, "loss": 0.2626, "step": 1223 }, { "epoch": 1.6105263157894738, "grad_norm": 0.0783379793278089, "learning_rate": 0.00010459135704399718, "loss": 0.2565, "step": 1224 }, { "epoch": 1.611842105263158, "grad_norm": 0.0789556340245482, "learning_rate": 0.00010443841412146065, "loss": 0.2732, "step": 1225 }, { "epoch": 1.6131578947368421, "grad_norm": 0.08284705629363337, "learning_rate": 0.00010428546079558463, "loss": 0.2784, "step": 1226 }, { "epoch": 1.6144736842105263, "grad_norm": 0.07926378933409738, "learning_rate": 0.00010413249742488131, "loss": 0.2626, "step": 1227 }, { "epoch": 1.6157894736842104, "grad_norm": 0.07646962136820679, "learning_rate": 0.00010397952436788642, "loss": 0.2576, "step": 1228 }, { "epoch": 1.6171052631578946, "grad_norm": 0.07459202591852203, "learning_rate": 0.00010382654198315834, "loss": 0.2544, "step": 1229 }, { "epoch": 1.618421052631579, "grad_norm": 0.07728933092230707, "learning_rate": 0.00010367355062927726, "loss": 0.2597, "step": 1230 }, { "epoch": 1.6197368421052631, "grad_norm": 0.0740833624188333, "learning_rate": 0.00010352055066484449, "loss": 0.2415, "step": 1231 }, { "epoch": 1.6210526315789475, "grad_norm": 0.07917560120385768, "learning_rate": 0.00010336754244848157, "loss": 0.2615, "step": 1232 }, { "epoch": 1.6223684210526317, "grad_norm": 0.08063325789327866, "learning_rate": 0.00010321452633882922, "loss": 0.2741, "step": 1233 }, { "epoch": 1.6236842105263158, "grad_norm": 0.07913725813766033, "learning_rate": 0.00010306150269454675, "loss": 0.2658, "step": 1234 }, { "epoch": 1.625, "grad_norm": 0.07936038810014337, "learning_rate": 0.00010290847187431113, "loss": 0.2618, "step": 1235 }, { "epoch": 1.6263157894736842, "grad_norm": 0.07924213689691284, "learning_rate": 0.00010275543423681621, "loss": 0.2614, "step": 1236 }, { "epoch": 1.6276315789473683, "grad_norm": 0.0771038728604427, "learning_rate": 0.0001026023901407717, "loss": 0.2477, "step": 1237 }, { "epoch": 1.6289473684210525, "grad_norm": 0.0770735907526327, "learning_rate": 0.00010244933994490249, "loss": 0.2548, "step": 1238 }, { "epoch": 1.6302631578947369, "grad_norm": 0.07674015928306312, "learning_rate": 0.0001022962840079478, "loss": 0.2547, "step": 1239 }, { "epoch": 1.631578947368421, "grad_norm": 0.07857136632734509, "learning_rate": 0.00010214322268866032, "loss": 0.2547, "step": 1240 }, { "epoch": 1.6328947368421054, "grad_norm": 0.08094746714498062, "learning_rate": 0.00010199015634580528, "loss": 0.2612, "step": 1241 }, { "epoch": 1.6342105263157896, "grad_norm": 0.07859907086130548, "learning_rate": 0.00010183708533815974, "loss": 0.2629, "step": 1242 }, { "epoch": 1.6355263157894737, "grad_norm": 0.07847032246484141, "learning_rate": 0.0001016840100245117, "loss": 0.2587, "step": 1243 }, { "epoch": 1.6368421052631579, "grad_norm": 0.0785820431028293, "learning_rate": 0.00010153093076365923, "loss": 0.2562, "step": 1244 }, { "epoch": 1.638157894736842, "grad_norm": 0.07896059641278598, "learning_rate": 0.00010137784791440965, "loss": 0.2572, "step": 1245 }, { "epoch": 1.6394736842105262, "grad_norm": 0.07921831824560291, "learning_rate": 0.00010122476183557869, "loss": 0.2727, "step": 1246 }, { "epoch": 1.6407894736842106, "grad_norm": 0.07588261624391224, "learning_rate": 0.00010107167288598967, "loss": 0.2505, "step": 1247 }, { "epoch": 1.6421052631578947, "grad_norm": 0.07634558208956514, "learning_rate": 0.00010091858142447265, "loss": 0.2536, "step": 1248 }, { "epoch": 1.643421052631579, "grad_norm": 0.07689177759409037, "learning_rate": 0.00010076548780986352, "loss": 0.2603, "step": 1249 }, { "epoch": 1.6447368421052633, "grad_norm": 0.07734671917849004, "learning_rate": 0.00010061239240100327, "loss": 0.2473, "step": 1250 }, { "epoch": 1.6460526315789474, "grad_norm": 0.07734454423729424, "learning_rate": 0.00010045929555673705, "loss": 0.2687, "step": 1251 }, { "epoch": 1.6473684210526316, "grad_norm": 0.0778519121851718, "learning_rate": 0.00010030619763591347, "loss": 0.2659, "step": 1252 }, { "epoch": 1.6486842105263158, "grad_norm": 0.07985594655530279, "learning_rate": 0.00010015309899738355, "loss": 0.269, "step": 1253 }, { "epoch": 1.65, "grad_norm": 0.07695791648645144, "learning_rate": 0.0001, "loss": 0.2552, "step": 1254 }, { "epoch": 1.651315789473684, "grad_norm": 0.07817327760226638, "learning_rate": 9.984690100261648e-05, "loss": 0.2493, "step": 1255 }, { "epoch": 1.6526315789473685, "grad_norm": 0.07787762754684492, "learning_rate": 9.969380236408656e-05, "loss": 0.2553, "step": 1256 }, { "epoch": 1.6539473684210526, "grad_norm": 0.08135305782817635, "learning_rate": 9.954070444326293e-05, "loss": 0.2505, "step": 1257 }, { "epoch": 1.655263157894737, "grad_norm": 0.07789141129556089, "learning_rate": 9.938760759899674e-05, "loss": 0.2527, "step": 1258 }, { "epoch": 1.6565789473684212, "grad_norm": 0.07824485146455672, "learning_rate": 9.923451219013651e-05, "loss": 0.2603, "step": 1259 }, { "epoch": 1.6578947368421053, "grad_norm": 0.07858931038856287, "learning_rate": 9.908141857552737e-05, "loss": 0.2631, "step": 1260 }, { "epoch": 1.6592105263157895, "grad_norm": 0.07644569879074736, "learning_rate": 9.892832711401036e-05, "loss": 0.2436, "step": 1261 }, { "epoch": 1.6605263157894736, "grad_norm": 0.08015020052581098, "learning_rate": 9.877523816442133e-05, "loss": 0.2733, "step": 1262 }, { "epoch": 1.6618421052631578, "grad_norm": 0.07973057338959542, "learning_rate": 9.862215208559037e-05, "loss": 0.2595, "step": 1263 }, { "epoch": 1.663157894736842, "grad_norm": 0.07569295606836161, "learning_rate": 9.846906923634079e-05, "loss": 0.2425, "step": 1264 }, { "epoch": 1.6644736842105263, "grad_norm": 0.07591010855329165, "learning_rate": 9.831598997548831e-05, "loss": 0.247, "step": 1265 }, { "epoch": 1.6657894736842105, "grad_norm": 0.07750100103712705, "learning_rate": 9.816291466184026e-05, "loss": 0.2475, "step": 1266 }, { "epoch": 1.6671052631578949, "grad_norm": 0.07706201585723965, "learning_rate": 9.800984365419475e-05, "loss": 0.2611, "step": 1267 }, { "epoch": 1.668421052631579, "grad_norm": 0.07793486034207057, "learning_rate": 9.78567773113397e-05, "loss": 0.263, "step": 1268 }, { "epoch": 1.6697368421052632, "grad_norm": 0.079099268943506, "learning_rate": 9.770371599205222e-05, "loss": 0.2578, "step": 1269 }, { "epoch": 1.6710526315789473, "grad_norm": 0.07871797278447953, "learning_rate": 9.755066005509753e-05, "loss": 0.2578, "step": 1270 }, { "epoch": 1.6723684210526315, "grad_norm": 0.0811458532081561, "learning_rate": 9.739760985922832e-05, "loss": 0.2579, "step": 1271 }, { "epoch": 1.6736842105263157, "grad_norm": 0.07824873316859218, "learning_rate": 9.724456576318381e-05, "loss": 0.2754, "step": 1272 }, { "epoch": 1.675, "grad_norm": 0.07647687339497086, "learning_rate": 9.709152812568886e-05, "loss": 0.2575, "step": 1273 }, { "epoch": 1.6763157894736842, "grad_norm": 0.07842920421047767, "learning_rate": 9.693849730545326e-05, "loss": 0.248, "step": 1274 }, { "epoch": 1.6776315789473686, "grad_norm": 0.0779966215397614, "learning_rate": 9.678547366117083e-05, "loss": 0.2669, "step": 1275 }, { "epoch": 1.6789473684210527, "grad_norm": 0.07565197476267557, "learning_rate": 9.663245755151846e-05, "loss": 0.2553, "step": 1276 }, { "epoch": 1.680263157894737, "grad_norm": 0.07727410664675037, "learning_rate": 9.647944933515552e-05, "loss": 0.2528, "step": 1277 }, { "epoch": 1.681578947368421, "grad_norm": 0.076100890561838, "learning_rate": 9.632644937072277e-05, "loss": 0.2547, "step": 1278 }, { "epoch": 1.6828947368421052, "grad_norm": 0.07998682208123038, "learning_rate": 9.617345801684169e-05, "loss": 0.2606, "step": 1279 }, { "epoch": 1.6842105263157894, "grad_norm": 0.0766101279713053, "learning_rate": 9.602047563211359e-05, "loss": 0.2706, "step": 1280 }, { "epoch": 1.6855263157894735, "grad_norm": 0.07714573829749818, "learning_rate": 9.586750257511867e-05, "loss": 0.2533, "step": 1281 }, { "epoch": 1.686842105263158, "grad_norm": 0.0774718178209979, "learning_rate": 9.571453920441538e-05, "loss": 0.2541, "step": 1282 }, { "epoch": 1.688157894736842, "grad_norm": 0.07724082139457383, "learning_rate": 9.556158587853941e-05, "loss": 0.2548, "step": 1283 }, { "epoch": 1.6894736842105265, "grad_norm": 0.08138472870698339, "learning_rate": 9.540864295600283e-05, "loss": 0.2767, "step": 1284 }, { "epoch": 1.6907894736842106, "grad_norm": 0.07668384972603022, "learning_rate": 9.525571079529347e-05, "loss": 0.2614, "step": 1285 }, { "epoch": 1.6921052631578948, "grad_norm": 0.07631635135059317, "learning_rate": 9.510278975487384e-05, "loss": 0.2568, "step": 1286 }, { "epoch": 1.693421052631579, "grad_norm": 0.07762293294175955, "learning_rate": 9.49498801931804e-05, "loss": 0.2608, "step": 1287 }, { "epoch": 1.694736842105263, "grad_norm": 0.08091630947772432, "learning_rate": 9.479698246862276e-05, "loss": 0.2624, "step": 1288 }, { "epoch": 1.6960526315789473, "grad_norm": 0.07844980001770621, "learning_rate": 9.464409693958269e-05, "loss": 0.2589, "step": 1289 }, { "epoch": 1.6973684210526314, "grad_norm": 0.07835999657865879, "learning_rate": 9.449122396441345e-05, "loss": 0.2673, "step": 1290 }, { "epoch": 1.6986842105263158, "grad_norm": 0.07660019194402308, "learning_rate": 9.433836390143887e-05, "loss": 0.2533, "step": 1291 }, { "epoch": 1.7, "grad_norm": 0.0775129862715817, "learning_rate": 9.418551710895243e-05, "loss": 0.2526, "step": 1292 }, { "epoch": 1.7013157894736843, "grad_norm": 0.07852001442550643, "learning_rate": 9.403268394521662e-05, "loss": 0.2536, "step": 1293 }, { "epoch": 1.7026315789473685, "grad_norm": 0.07775416745660803, "learning_rate": 9.38798647684619e-05, "loss": 0.2568, "step": 1294 }, { "epoch": 1.7039473684210527, "grad_norm": 0.0773123834839056, "learning_rate": 9.372705993688599e-05, "loss": 0.2487, "step": 1295 }, { "epoch": 1.7052631578947368, "grad_norm": 0.07933332672618959, "learning_rate": 9.357426980865301e-05, "loss": 0.2546, "step": 1296 }, { "epoch": 1.706578947368421, "grad_norm": 0.0758849825907164, "learning_rate": 9.342149474189251e-05, "loss": 0.2527, "step": 1297 }, { "epoch": 1.7078947368421051, "grad_norm": 0.07768939168993196, "learning_rate": 9.326873509469887e-05, "loss": 0.2635, "step": 1298 }, { "epoch": 1.7092105263157895, "grad_norm": 0.07733832118219917, "learning_rate": 9.311599122513029e-05, "loss": 0.2639, "step": 1299 }, { "epoch": 1.7105263157894737, "grad_norm": 0.07871573341153681, "learning_rate": 9.296326349120785e-05, "loss": 0.2618, "step": 1300 }, { "epoch": 1.711842105263158, "grad_norm": 0.07866153909260622, "learning_rate": 9.281055225091503e-05, "loss": 0.2594, "step": 1301 }, { "epoch": 1.7131578947368422, "grad_norm": 0.07974140917973452, "learning_rate": 9.265785786219647e-05, "loss": 0.2668, "step": 1302 }, { "epoch": 1.7144736842105264, "grad_norm": 0.08096294253599645, "learning_rate": 9.250518068295744e-05, "loss": 0.2652, "step": 1303 }, { "epoch": 1.7157894736842105, "grad_norm": 0.0790015551619781, "learning_rate": 9.235252107106279e-05, "loss": 0.2657, "step": 1304 }, { "epoch": 1.7171052631578947, "grad_norm": 0.07763615541336244, "learning_rate": 9.219987938433621e-05, "loss": 0.2629, "step": 1305 }, { "epoch": 1.7184210526315788, "grad_norm": 0.07778669559237536, "learning_rate": 9.204725598055942e-05, "loss": 0.2636, "step": 1306 }, { "epoch": 1.719736842105263, "grad_norm": 0.07939255811714764, "learning_rate": 9.189465121747125e-05, "loss": 0.2705, "step": 1307 }, { "epoch": 1.7210526315789474, "grad_norm": 0.07960488700289318, "learning_rate": 9.174206545276677e-05, "loss": 0.2706, "step": 1308 }, { "epoch": 1.7223684210526315, "grad_norm": 0.07723537042473928, "learning_rate": 9.158949904409668e-05, "loss": 0.2621, "step": 1309 }, { "epoch": 1.723684210526316, "grad_norm": 0.07861687285953221, "learning_rate": 9.143695234906611e-05, "loss": 0.2549, "step": 1310 }, { "epoch": 1.725, "grad_norm": 0.07928259749395238, "learning_rate": 9.128442572523417e-05, "loss": 0.256, "step": 1311 }, { "epoch": 1.7263157894736842, "grad_norm": 0.07977907458496993, "learning_rate": 9.113191953011287e-05, "loss": 0.2621, "step": 1312 }, { "epoch": 1.7276315789473684, "grad_norm": 0.07908599126862541, "learning_rate": 9.09794341211662e-05, "loss": 0.2577, "step": 1313 }, { "epoch": 1.7289473684210526, "grad_norm": 0.07804526868832733, "learning_rate": 9.082696985580964e-05, "loss": 0.2574, "step": 1314 }, { "epoch": 1.7302631578947367, "grad_norm": 0.07671319689683427, "learning_rate": 9.0674527091409e-05, "loss": 0.2551, "step": 1315 }, { "epoch": 1.731578947368421, "grad_norm": 0.07918675981206362, "learning_rate": 9.052210618527966e-05, "loss": 0.2478, "step": 1316 }, { "epoch": 1.7328947368421053, "grad_norm": 0.07700717313469471, "learning_rate": 9.036970749468584e-05, "loss": 0.2484, "step": 1317 }, { "epoch": 1.7342105263157894, "grad_norm": 0.07832287153888227, "learning_rate": 9.021733137683962e-05, "loss": 0.2604, "step": 1318 }, { "epoch": 1.7355263157894738, "grad_norm": 0.076498834528713, "learning_rate": 9.006497818890024e-05, "loss": 0.2536, "step": 1319 }, { "epoch": 1.736842105263158, "grad_norm": 0.077411644809103, "learning_rate": 8.991264828797319e-05, "loss": 0.2578, "step": 1320 }, { "epoch": 1.7381578947368421, "grad_norm": 0.07747513215833557, "learning_rate": 8.97603420311093e-05, "loss": 0.2574, "step": 1321 }, { "epoch": 1.7394736842105263, "grad_norm": 0.07842803983473304, "learning_rate": 8.960805977530404e-05, "loss": 0.2606, "step": 1322 }, { "epoch": 1.7407894736842104, "grad_norm": 0.07436646379437782, "learning_rate": 8.945580187749666e-05, "loss": 0.2487, "step": 1323 }, { "epoch": 1.7421052631578946, "grad_norm": 0.07487978241239661, "learning_rate": 8.930356869456919e-05, "loss": 0.2532, "step": 1324 }, { "epoch": 1.743421052631579, "grad_norm": 0.07966307306554148, "learning_rate": 8.915136058334588e-05, "loss": 0.2665, "step": 1325 }, { "epoch": 1.7447368421052631, "grad_norm": 0.08106464265363358, "learning_rate": 8.899917790059208e-05, "loss": 0.2643, "step": 1326 }, { "epoch": 1.7460526315789475, "grad_norm": 0.07593234358817623, "learning_rate": 8.884702100301364e-05, "loss": 0.2586, "step": 1327 }, { "epoch": 1.7473684210526317, "grad_norm": 0.07609912132517359, "learning_rate": 8.869489024725595e-05, "loss": 0.2459, "step": 1328 }, { "epoch": 1.7486842105263158, "grad_norm": 0.07772499392839312, "learning_rate": 8.854278598990305e-05, "loss": 0.2632, "step": 1329 }, { "epoch": 1.75, "grad_norm": 0.07646456091007017, "learning_rate": 8.839070858747697e-05, "loss": 0.2526, "step": 1330 }, { "epoch": 1.7513157894736842, "grad_norm": 0.07793051011581553, "learning_rate": 8.823865839643677e-05, "loss": 0.2714, "step": 1331 }, { "epoch": 1.7526315789473683, "grad_norm": 0.07790374700766114, "learning_rate": 8.808663577317764e-05, "loss": 0.2605, "step": 1332 }, { "epoch": 1.7539473684210525, "grad_norm": 0.0800107435876805, "learning_rate": 8.793464107403028e-05, "loss": 0.2595, "step": 1333 }, { "epoch": 1.7552631578947369, "grad_norm": 0.08006102878272488, "learning_rate": 8.778267465525985e-05, "loss": 0.2667, "step": 1334 }, { "epoch": 1.756578947368421, "grad_norm": 0.07794460119809767, "learning_rate": 8.763073687306524e-05, "loss": 0.2506, "step": 1335 }, { "epoch": 1.7578947368421054, "grad_norm": 0.07759219048166072, "learning_rate": 8.747882808357828e-05, "loss": 0.2705, "step": 1336 }, { "epoch": 1.7592105263157896, "grad_norm": 0.07712939424044654, "learning_rate": 8.732694864286273e-05, "loss": 0.2635, "step": 1337 }, { "epoch": 1.7605263157894737, "grad_norm": 0.07743375242887887, "learning_rate": 8.717509890691368e-05, "loss": 0.2587, "step": 1338 }, { "epoch": 1.7618421052631579, "grad_norm": 0.07860695060081709, "learning_rate": 8.702327923165654e-05, "loss": 0.2661, "step": 1339 }, { "epoch": 1.763157894736842, "grad_norm": 0.0792831934996456, "learning_rate": 8.687148997294621e-05, "loss": 0.2678, "step": 1340 }, { "epoch": 1.7644736842105262, "grad_norm": 0.07976560061008468, "learning_rate": 8.671973148656634e-05, "loss": 0.2619, "step": 1341 }, { "epoch": 1.7657894736842106, "grad_norm": 0.08106987042463963, "learning_rate": 8.656800412822847e-05, "loss": 0.2735, "step": 1342 }, { "epoch": 1.7671052631578947, "grad_norm": 0.07818392747232804, "learning_rate": 8.641630825357115e-05, "loss": 0.2387, "step": 1343 }, { "epoch": 1.768421052631579, "grad_norm": 0.07902258148724874, "learning_rate": 8.626464421815919e-05, "loss": 0.2524, "step": 1344 }, { "epoch": 1.7697368421052633, "grad_norm": 0.07756812342250517, "learning_rate": 8.611301237748267e-05, "loss": 0.2621, "step": 1345 }, { "epoch": 1.7710526315789474, "grad_norm": 0.07885482619098905, "learning_rate": 8.596141308695628e-05, "loss": 0.2544, "step": 1346 }, { "epoch": 1.7723684210526316, "grad_norm": 0.08005110335934358, "learning_rate": 8.580984670191848e-05, "loss": 0.2539, "step": 1347 }, { "epoch": 1.7736842105263158, "grad_norm": 0.07943085562482573, "learning_rate": 8.565831357763039e-05, "loss": 0.2622, "step": 1348 }, { "epoch": 1.775, "grad_norm": 0.08121598066546899, "learning_rate": 8.550681406927535e-05, "loss": 0.2658, "step": 1349 }, { "epoch": 1.776315789473684, "grad_norm": 0.0773683018126476, "learning_rate": 8.535534853195786e-05, "loss": 0.2629, "step": 1350 }, { "epoch": 1.7776315789473685, "grad_norm": 0.07743762444071942, "learning_rate": 8.520391732070279e-05, "loss": 0.2477, "step": 1351 }, { "epoch": 1.7789473684210526, "grad_norm": 0.07738788021837957, "learning_rate": 8.505252079045458e-05, "loss": 0.2617, "step": 1352 }, { "epoch": 1.780263157894737, "grad_norm": 0.07732362511743367, "learning_rate": 8.490115929607631e-05, "loss": 0.2562, "step": 1353 }, { "epoch": 1.7815789473684212, "grad_norm": 0.07882812579817991, "learning_rate": 8.474983319234899e-05, "loss": 0.26, "step": 1354 }, { "epoch": 1.7828947368421053, "grad_norm": 0.0764856203909363, "learning_rate": 8.459854283397073e-05, "loss": 0.2561, "step": 1355 }, { "epoch": 1.7842105263157895, "grad_norm": 0.07723245518951646, "learning_rate": 8.444728857555572e-05, "loss": 0.2504, "step": 1356 }, { "epoch": 1.7855263157894736, "grad_norm": 0.07746876461441092, "learning_rate": 8.42960707716336e-05, "loss": 0.247, "step": 1357 }, { "epoch": 1.7868421052631578, "grad_norm": 0.07569577111860483, "learning_rate": 8.414488977664859e-05, "loss": 0.2409, "step": 1358 }, { "epoch": 1.788157894736842, "grad_norm": 0.07919043138051154, "learning_rate": 8.399374594495861e-05, "loss": 0.2483, "step": 1359 }, { "epoch": 1.7894736842105263, "grad_norm": 0.08072885452301694, "learning_rate": 8.384263963083453e-05, "loss": 0.2527, "step": 1360 }, { "epoch": 1.7907894736842105, "grad_norm": 0.07843210339508665, "learning_rate": 8.369157118845914e-05, "loss": 0.2538, "step": 1361 }, { "epoch": 1.7921052631578949, "grad_norm": 0.08063967474828551, "learning_rate": 8.35405409719266e-05, "loss": 0.2619, "step": 1362 }, { "epoch": 1.793421052631579, "grad_norm": 0.07796033453134701, "learning_rate": 8.338954933524144e-05, "loss": 0.253, "step": 1363 }, { "epoch": 1.7947368421052632, "grad_norm": 0.08080704163282365, "learning_rate": 8.323859663231768e-05, "loss": 0.2691, "step": 1364 }, { "epoch": 1.7960526315789473, "grad_norm": 0.0786006883745561, "learning_rate": 8.308768321697815e-05, "loss": 0.263, "step": 1365 }, { "epoch": 1.7973684210526315, "grad_norm": 0.07705721459554137, "learning_rate": 8.293680944295359e-05, "loss": 0.2486, "step": 1366 }, { "epoch": 1.7986842105263157, "grad_norm": 0.07751983519025603, "learning_rate": 8.278597566388184e-05, "loss": 0.2578, "step": 1367 }, { "epoch": 1.8, "grad_norm": 0.07892468926699212, "learning_rate": 8.263518223330697e-05, "loss": 0.2503, "step": 1368 }, { "epoch": 1.8013157894736842, "grad_norm": 0.07633017824416555, "learning_rate": 8.248442950467845e-05, "loss": 0.2623, "step": 1369 }, { "epoch": 1.8026315789473686, "grad_norm": 0.07718202591821682, "learning_rate": 8.23337178313504e-05, "loss": 0.2565, "step": 1370 }, { "epoch": 1.8039473684210527, "grad_norm": 0.0807857739995025, "learning_rate": 8.218304756658072e-05, "loss": 0.2599, "step": 1371 }, { "epoch": 1.805263157894737, "grad_norm": 0.0790429723121207, "learning_rate": 8.203241906353014e-05, "loss": 0.2577, "step": 1372 }, { "epoch": 1.806578947368421, "grad_norm": 0.07931013296038247, "learning_rate": 8.188183267526161e-05, "loss": 0.2639, "step": 1373 }, { "epoch": 1.8078947368421052, "grad_norm": 0.07877666664803827, "learning_rate": 8.173128875473932e-05, "loss": 0.2563, "step": 1374 }, { "epoch": 1.8092105263157894, "grad_norm": 0.07782636186273496, "learning_rate": 8.158078765482796e-05, "loss": 0.2635, "step": 1375 }, { "epoch": 1.8105263157894735, "grad_norm": 0.07632603628610292, "learning_rate": 8.143032972829183e-05, "loss": 0.2568, "step": 1376 }, { "epoch": 1.811842105263158, "grad_norm": 0.07623514696597232, "learning_rate": 8.127991532779401e-05, "loss": 0.2498, "step": 1377 }, { "epoch": 1.813157894736842, "grad_norm": 0.0777236902872697, "learning_rate": 8.112954480589558e-05, "loss": 0.2534, "step": 1378 }, { "epoch": 1.8144736842105265, "grad_norm": 0.07656061477259994, "learning_rate": 8.09792185150548e-05, "loss": 0.26, "step": 1379 }, { "epoch": 1.8157894736842106, "grad_norm": 0.0789133570152323, "learning_rate": 8.082893680762619e-05, "loss": 0.2604, "step": 1380 }, { "epoch": 1.8171052631578948, "grad_norm": 0.07851474431282743, "learning_rate": 8.067870003585978e-05, "loss": 0.2663, "step": 1381 }, { "epoch": 1.818421052631579, "grad_norm": 0.0789616006642964, "learning_rate": 8.052850855190034e-05, "loss": 0.2638, "step": 1382 }, { "epoch": 1.819736842105263, "grad_norm": 0.07562051436427474, "learning_rate": 8.037836270778642e-05, "loss": 0.2512, "step": 1383 }, { "epoch": 1.8210526315789473, "grad_norm": 0.08100206522865458, "learning_rate": 8.022826285544968e-05, "loss": 0.2474, "step": 1384 }, { "epoch": 1.8223684210526314, "grad_norm": 0.0794818851301571, "learning_rate": 8.007820934671383e-05, "loss": 0.2713, "step": 1385 }, { "epoch": 1.8236842105263158, "grad_norm": 0.07947535099199077, "learning_rate": 7.992820253329409e-05, "loss": 0.2589, "step": 1386 }, { "epoch": 1.825, "grad_norm": 0.08012876631030338, "learning_rate": 7.977824276679623e-05, "loss": 0.2656, "step": 1387 }, { "epoch": 1.8263157894736843, "grad_norm": 0.0778147716148084, "learning_rate": 7.96283303987156e-05, "loss": 0.2574, "step": 1388 }, { "epoch": 1.8276315789473685, "grad_norm": 0.08108286483882464, "learning_rate": 7.947846578043659e-05, "loss": 0.2592, "step": 1389 }, { "epoch": 1.8289473684210527, "grad_norm": 0.07898753250610759, "learning_rate": 7.932864926323161e-05, "loss": 0.2612, "step": 1390 }, { "epoch": 1.8302631578947368, "grad_norm": 0.07750883606242706, "learning_rate": 7.917888119826036e-05, "loss": 0.2533, "step": 1391 }, { "epoch": 1.831578947368421, "grad_norm": 0.08012157144092143, "learning_rate": 7.902916193656898e-05, "loss": 0.2664, "step": 1392 }, { "epoch": 1.8328947368421051, "grad_norm": 0.07738904505017828, "learning_rate": 7.887949182908912e-05, "loss": 0.2599, "step": 1393 }, { "epoch": 1.8342105263157895, "grad_norm": 0.07769928944177891, "learning_rate": 7.872987122663733e-05, "loss": 0.2624, "step": 1394 }, { "epoch": 1.8355263157894737, "grad_norm": 0.08069668200882675, "learning_rate": 7.858030047991411e-05, "loss": 0.2589, "step": 1395 }, { "epoch": 1.836842105263158, "grad_norm": 0.07833127833956029, "learning_rate": 7.843077993950302e-05, "loss": 0.2572, "step": 1396 }, { "epoch": 1.8381578947368422, "grad_norm": 0.07971876020235151, "learning_rate": 7.828130995586998e-05, "loss": 0.2582, "step": 1397 }, { "epoch": 1.8394736842105264, "grad_norm": 0.07885772994939849, "learning_rate": 7.813189087936243e-05, "loss": 0.2738, "step": 1398 }, { "epoch": 1.8407894736842105, "grad_norm": 0.07736167226784646, "learning_rate": 7.798252306020851e-05, "loss": 0.2608, "step": 1399 }, { "epoch": 1.8421052631578947, "grad_norm": 0.07630819815719928, "learning_rate": 7.783320684851614e-05, "loss": 0.2613, "step": 1400 }, { "epoch": 1.8434210526315788, "grad_norm": 0.07834945368484907, "learning_rate": 7.768394259427234e-05, "loss": 0.2614, "step": 1401 }, { "epoch": 1.844736842105263, "grad_norm": 0.07718022449050989, "learning_rate": 7.753473064734232e-05, "loss": 0.2704, "step": 1402 }, { "epoch": 1.8460526315789474, "grad_norm": 0.07707441842552827, "learning_rate": 7.738557135746873e-05, "loss": 0.2576, "step": 1403 }, { "epoch": 1.8473684210526315, "grad_norm": 0.07812383749123156, "learning_rate": 7.72364650742707e-05, "loss": 0.2635, "step": 1404 }, { "epoch": 1.848684210526316, "grad_norm": 0.07972960299568377, "learning_rate": 7.708741214724315e-05, "loss": 0.2711, "step": 1405 }, { "epoch": 1.85, "grad_norm": 0.07567077698895516, "learning_rate": 7.693841292575598e-05, "loss": 0.2583, "step": 1406 }, { "epoch": 1.8513157894736842, "grad_norm": 0.07920973523092241, "learning_rate": 7.678946775905324e-05, "loss": 0.2565, "step": 1407 }, { "epoch": 1.8526315789473684, "grad_norm": 0.07722915516418472, "learning_rate": 7.664057699625214e-05, "loss": 0.2526, "step": 1408 }, { "epoch": 1.8539473684210526, "grad_norm": 0.08254943723021463, "learning_rate": 7.649174098634251e-05, "loss": 0.27, "step": 1409 }, { "epoch": 1.8552631578947367, "grad_norm": 0.0778109023166561, "learning_rate": 7.634296007818576e-05, "loss": 0.2516, "step": 1410 }, { "epoch": 1.856578947368421, "grad_norm": 0.07700945488788534, "learning_rate": 7.619423462051423e-05, "loss": 0.2646, "step": 1411 }, { "epoch": 1.8578947368421053, "grad_norm": 0.07630139704348335, "learning_rate": 7.604556496193015e-05, "loss": 0.2538, "step": 1412 }, { "epoch": 1.8592105263157894, "grad_norm": 0.07939635050751274, "learning_rate": 7.589695145090506e-05, "loss": 0.2667, "step": 1413 }, { "epoch": 1.8605263157894738, "grad_norm": 0.07616939180006002, "learning_rate": 7.57483944357789e-05, "loss": 0.2529, "step": 1414 }, { "epoch": 1.861842105263158, "grad_norm": 0.08117405461306933, "learning_rate": 7.559989426475917e-05, "loss": 0.2555, "step": 1415 }, { "epoch": 1.8631578947368421, "grad_norm": 0.08059557719232159, "learning_rate": 7.54514512859201e-05, "loss": 0.2628, "step": 1416 }, { "epoch": 1.8644736842105263, "grad_norm": 0.078588193817039, "learning_rate": 7.530306584720188e-05, "loss": 0.2615, "step": 1417 }, { "epoch": 1.8657894736842104, "grad_norm": 0.07865927809600427, "learning_rate": 7.515473829640987e-05, "loss": 0.2588, "step": 1418 }, { "epoch": 1.8671052631578946, "grad_norm": 0.07875855234210856, "learning_rate": 7.500646898121373e-05, "loss": 0.254, "step": 1419 }, { "epoch": 1.868421052631579, "grad_norm": 0.07994170191297952, "learning_rate": 7.485825824914659e-05, "loss": 0.2625, "step": 1420 }, { "epoch": 1.8697368421052631, "grad_norm": 0.07669419084742685, "learning_rate": 7.471010644760421e-05, "loss": 0.2639, "step": 1421 }, { "epoch": 1.8710526315789475, "grad_norm": 0.07671532012322799, "learning_rate": 7.456201392384436e-05, "loss": 0.2618, "step": 1422 }, { "epoch": 1.8723684210526317, "grad_norm": 0.0760382420175465, "learning_rate": 7.441398102498582e-05, "loss": 0.2586, "step": 1423 }, { "epoch": 1.8736842105263158, "grad_norm": 0.07479447004096319, "learning_rate": 7.426600809800752e-05, "loss": 0.2487, "step": 1424 }, { "epoch": 1.875, "grad_norm": 0.07952054209905557, "learning_rate": 7.411809548974792e-05, "loss": 0.2694, "step": 1425 }, { "epoch": 1.8763157894736842, "grad_norm": 0.0795536063574035, "learning_rate": 7.397024354690408e-05, "loss": 0.2575, "step": 1426 }, { "epoch": 1.8776315789473683, "grad_norm": 0.07893539617767596, "learning_rate": 7.382245261603088e-05, "loss": 0.271, "step": 1427 }, { "epoch": 1.8789473684210525, "grad_norm": 0.08003786520908833, "learning_rate": 7.36747230435401e-05, "loss": 0.2499, "step": 1428 }, { "epoch": 1.8802631578947369, "grad_norm": 0.078791451136056, "learning_rate": 7.352705517569977e-05, "loss": 0.2674, "step": 1429 }, { "epoch": 1.881578947368421, "grad_norm": 0.0761073189754986, "learning_rate": 7.337944935863333e-05, "loss": 0.2561, "step": 1430 }, { "epoch": 1.8828947368421054, "grad_norm": 0.07783417880514816, "learning_rate": 7.32319059383187e-05, "loss": 0.2566, "step": 1431 }, { "epoch": 1.8842105263157896, "grad_norm": 0.07782790823841897, "learning_rate": 7.308442526058756e-05, "loss": 0.2681, "step": 1432 }, { "epoch": 1.8855263157894737, "grad_norm": 0.07722518403441637, "learning_rate": 7.293700767112458e-05, "loss": 0.2581, "step": 1433 }, { "epoch": 1.8868421052631579, "grad_norm": 0.07897906723081582, "learning_rate": 7.278965351546648e-05, "loss": 0.2618, "step": 1434 }, { "epoch": 1.888157894736842, "grad_norm": 0.07503201774332066, "learning_rate": 7.264236313900141e-05, "loss": 0.2449, "step": 1435 }, { "epoch": 1.8894736842105262, "grad_norm": 0.07528777047208206, "learning_rate": 7.249513688696786e-05, "loss": 0.2509, "step": 1436 }, { "epoch": 1.8907894736842106, "grad_norm": 0.07701172257337681, "learning_rate": 7.234797510445411e-05, "loss": 0.2502, "step": 1437 }, { "epoch": 1.8921052631578947, "grad_norm": 0.07610174570738121, "learning_rate": 7.220087813639736e-05, "loss": 0.2551, "step": 1438 }, { "epoch": 1.893421052631579, "grad_norm": 0.07662368675063383, "learning_rate": 7.205384632758285e-05, "loss": 0.2611, "step": 1439 }, { "epoch": 1.8947368421052633, "grad_norm": 0.07837947549366885, "learning_rate": 7.190688002264308e-05, "loss": 0.2587, "step": 1440 }, { "epoch": 1.8960526315789474, "grad_norm": 0.07828653663313988, "learning_rate": 7.175997956605701e-05, "loss": 0.2542, "step": 1441 }, { "epoch": 1.8973684210526316, "grad_norm": 0.07749211808620897, "learning_rate": 7.161314530214931e-05, "loss": 0.2554, "step": 1442 }, { "epoch": 1.8986842105263158, "grad_norm": 0.07955661477515108, "learning_rate": 7.146637757508949e-05, "loss": 0.2465, "step": 1443 }, { "epoch": 1.9, "grad_norm": 0.0780534289115777, "learning_rate": 7.131967672889101e-05, "loss": 0.2587, "step": 1444 }, { "epoch": 1.901315789473684, "grad_norm": 0.08230240962537452, "learning_rate": 7.117304310741062e-05, "loss": 0.2676, "step": 1445 }, { "epoch": 1.9026315789473685, "grad_norm": 0.07514699270244664, "learning_rate": 7.102647705434756e-05, "loss": 0.2515, "step": 1446 }, { "epoch": 1.9039473684210526, "grad_norm": 0.07838555881596923, "learning_rate": 7.087997891324262e-05, "loss": 0.2676, "step": 1447 }, { "epoch": 1.905263157894737, "grad_norm": 0.07794040393822359, "learning_rate": 7.073354902747741e-05, "loss": 0.2622, "step": 1448 }, { "epoch": 1.9065789473684212, "grad_norm": 0.07793028180395223, "learning_rate": 7.058718774027364e-05, "loss": 0.2581, "step": 1449 }, { "epoch": 1.9078947368421053, "grad_norm": 0.07468275561587644, "learning_rate": 7.044089539469212e-05, "loss": 0.2536, "step": 1450 }, { "epoch": 1.9092105263157895, "grad_norm": 0.07736426080010585, "learning_rate": 7.029467233363216e-05, "loss": 0.2607, "step": 1451 }, { "epoch": 1.9105263157894736, "grad_norm": 0.07788040509177954, "learning_rate": 7.014851889983057e-05, "loss": 0.2699, "step": 1452 }, { "epoch": 1.9118421052631578, "grad_norm": 0.07482713534215896, "learning_rate": 7.000243543586102e-05, "loss": 0.2562, "step": 1453 }, { "epoch": 1.913157894736842, "grad_norm": 0.07738656143600638, "learning_rate": 6.985642228413316e-05, "loss": 0.2589, "step": 1454 }, { "epoch": 1.9144736842105263, "grad_norm": 0.0771729364408806, "learning_rate": 6.971047978689189e-05, "loss": 0.2651, "step": 1455 }, { "epoch": 1.9157894736842105, "grad_norm": 0.07648368441271167, "learning_rate": 6.95646082862164e-05, "loss": 0.2568, "step": 1456 }, { "epoch": 1.9171052631578949, "grad_norm": 0.07769870758338829, "learning_rate": 6.941880812401956e-05, "loss": 0.2537, "step": 1457 }, { "epoch": 1.918421052631579, "grad_norm": 0.07903999465640997, "learning_rate": 6.927307964204694e-05, "loss": 0.2666, "step": 1458 }, { "epoch": 1.9197368421052632, "grad_norm": 0.08045066676431731, "learning_rate": 6.912742318187624e-05, "loss": 0.2605, "step": 1459 }, { "epoch": 1.9210526315789473, "grad_norm": 0.07627530893025287, "learning_rate": 6.898183908491617e-05, "loss": 0.255, "step": 1460 }, { "epoch": 1.9223684210526315, "grad_norm": 0.07918035796815091, "learning_rate": 6.883632769240589e-05, "loss": 0.2565, "step": 1461 }, { "epoch": 1.9236842105263157, "grad_norm": 0.079331461000813, "learning_rate": 6.869088934541419e-05, "loss": 0.2595, "step": 1462 }, { "epoch": 1.925, "grad_norm": 0.07819193010414092, "learning_rate": 6.854552438483865e-05, "loss": 0.2602, "step": 1463 }, { "epoch": 1.9263157894736842, "grad_norm": 0.08127166962507164, "learning_rate": 6.840023315140475e-05, "loss": 0.2563, "step": 1464 }, { "epoch": 1.9276315789473686, "grad_norm": 0.08070961078490956, "learning_rate": 6.825501598566525e-05, "loss": 0.259, "step": 1465 }, { "epoch": 1.9289473684210527, "grad_norm": 0.0766943472529489, "learning_rate": 6.810987322799926e-05, "loss": 0.2586, "step": 1466 }, { "epoch": 1.930263157894737, "grad_norm": 0.08113329672328959, "learning_rate": 6.79648052186115e-05, "loss": 0.2425, "step": 1467 }, { "epoch": 1.931578947368421, "grad_norm": 0.07603940923660328, "learning_rate": 6.781981229753145e-05, "loss": 0.2534, "step": 1468 }, { "epoch": 1.9328947368421052, "grad_norm": 0.07646855224356787, "learning_rate": 6.76748948046126e-05, "loss": 0.2506, "step": 1469 }, { "epoch": 1.9342105263157894, "grad_norm": 0.07477033930912226, "learning_rate": 6.753005307953167e-05, "loss": 0.2509, "step": 1470 }, { "epoch": 1.9355263157894735, "grad_norm": 0.07717545781453299, "learning_rate": 6.738528746178776e-05, "loss": 0.2576, "step": 1471 }, { "epoch": 1.936842105263158, "grad_norm": 0.07742807164222197, "learning_rate": 6.724059829070158e-05, "loss": 0.2647, "step": 1472 }, { "epoch": 1.938157894736842, "grad_norm": 0.07798070076914343, "learning_rate": 6.709598590541469e-05, "loss": 0.2593, "step": 1473 }, { "epoch": 1.9394736842105265, "grad_norm": 0.07717711873805973, "learning_rate": 6.695145064488861e-05, "loss": 0.2589, "step": 1474 }, { "epoch": 1.9407894736842106, "grad_norm": 0.07645570062299177, "learning_rate": 6.680699284790415e-05, "loss": 0.2612, "step": 1475 }, { "epoch": 1.9421052631578948, "grad_norm": 0.07709702657938383, "learning_rate": 6.666261285306047e-05, "loss": 0.2643, "step": 1476 }, { "epoch": 1.943421052631579, "grad_norm": 0.07634109328627783, "learning_rate": 6.651831099877444e-05, "loss": 0.2468, "step": 1477 }, { "epoch": 1.944736842105263, "grad_norm": 0.07783850878336081, "learning_rate": 6.637408762327972e-05, "loss": 0.2607, "step": 1478 }, { "epoch": 1.9460526315789473, "grad_norm": 0.07820503016401602, "learning_rate": 6.622994306462611e-05, "loss": 0.2574, "step": 1479 }, { "epoch": 1.9473684210526314, "grad_norm": 0.0796202316851204, "learning_rate": 6.608587766067852e-05, "loss": 0.2575, "step": 1480 }, { "epoch": 1.9486842105263158, "grad_norm": 0.07605261401674518, "learning_rate": 6.59418917491165e-05, "loss": 0.2591, "step": 1481 }, { "epoch": 1.95, "grad_norm": 0.07560659095541633, "learning_rate": 6.579798566743314e-05, "loss": 0.2526, "step": 1482 }, { "epoch": 1.9513157894736843, "grad_norm": 0.07747930757786237, "learning_rate": 6.565415975293448e-05, "loss": 0.2631, "step": 1483 }, { "epoch": 1.9526315789473685, "grad_norm": 0.07643184520906729, "learning_rate": 6.551041434273861e-05, "loss": 0.2508, "step": 1484 }, { "epoch": 1.9539473684210527, "grad_norm": 0.0796433446523202, "learning_rate": 6.536674977377496e-05, "loss": 0.2543, "step": 1485 }, { "epoch": 1.9552631578947368, "grad_norm": 0.07657952724716138, "learning_rate": 6.522316638278347e-05, "loss": 0.2552, "step": 1486 }, { "epoch": 1.956578947368421, "grad_norm": 0.07677380083558996, "learning_rate": 6.507966450631382e-05, "loss": 0.2541, "step": 1487 }, { "epoch": 1.9578947368421051, "grad_norm": 0.07595336702259202, "learning_rate": 6.493624448072457e-05, "loss": 0.2487, "step": 1488 }, { "epoch": 1.9592105263157895, "grad_norm": 0.07721242087118127, "learning_rate": 6.479290664218247e-05, "loss": 0.2507, "step": 1489 }, { "epoch": 1.9605263157894737, "grad_norm": 0.07755079832687756, "learning_rate": 6.464965132666163e-05, "loss": 0.2574, "step": 1490 }, { "epoch": 1.961842105263158, "grad_norm": 0.07980082932294216, "learning_rate": 6.450647886994272e-05, "loss": 0.2704, "step": 1491 }, { "epoch": 1.9631578947368422, "grad_norm": 0.07840280028957075, "learning_rate": 6.43633896076122e-05, "loss": 0.2565, "step": 1492 }, { "epoch": 1.9644736842105264, "grad_norm": 0.07780809182938828, "learning_rate": 6.422038387506149e-05, "loss": 0.2626, "step": 1493 }, { "epoch": 1.9657894736842105, "grad_norm": 0.07710732052984494, "learning_rate": 6.407746200748628e-05, "loss": 0.2565, "step": 1494 }, { "epoch": 1.9671052631578947, "grad_norm": 0.07590458797290023, "learning_rate": 6.393462433988569e-05, "loss": 0.2547, "step": 1495 }, { "epoch": 1.9684210526315788, "grad_norm": 0.07792407917489579, "learning_rate": 6.379187120706138e-05, "loss": 0.2466, "step": 1496 }, { "epoch": 1.969736842105263, "grad_norm": 0.07781793554961416, "learning_rate": 6.3649202943617e-05, "loss": 0.266, "step": 1497 }, { "epoch": 1.9710526315789474, "grad_norm": 0.07455889127205702, "learning_rate": 6.350661988395723e-05, "loss": 0.2458, "step": 1498 }, { "epoch": 1.9723684210526315, "grad_norm": 0.07493719255336764, "learning_rate": 6.336412236228697e-05, "loss": 0.2565, "step": 1499 }, { "epoch": 1.973684210526316, "grad_norm": 0.07690525334306028, "learning_rate": 6.322171071261071e-05, "loss": 0.2613, "step": 1500 }, { "epoch": 1.975, "grad_norm": 0.07579197410852237, "learning_rate": 6.307938526873157e-05, "loss": 0.2523, "step": 1501 }, { "epoch": 1.9763157894736842, "grad_norm": 0.08008252829550877, "learning_rate": 6.293714636425071e-05, "loss": 0.2671, "step": 1502 }, { "epoch": 1.9776315789473684, "grad_norm": 0.07781399370339455, "learning_rate": 6.279499433256642e-05, "loss": 0.2603, "step": 1503 }, { "epoch": 1.9789473684210526, "grad_norm": 0.07813006074806125, "learning_rate": 6.26529295068733e-05, "loss": 0.2493, "step": 1504 }, { "epoch": 1.9802631578947367, "grad_norm": 0.07894579521693036, "learning_rate": 6.251095222016162e-05, "loss": 0.2624, "step": 1505 }, { "epoch": 1.981578947368421, "grad_norm": 0.07494640550743084, "learning_rate": 6.236906280521646e-05, "loss": 0.2409, "step": 1506 }, { "epoch": 1.9828947368421053, "grad_norm": 0.07893332158347492, "learning_rate": 6.22272615946169e-05, "loss": 0.2629, "step": 1507 }, { "epoch": 1.9842105263157894, "grad_norm": 0.07842421722733443, "learning_rate": 6.208554892073528e-05, "loss": 0.2504, "step": 1508 }, { "epoch": 1.9855263157894738, "grad_norm": 0.08117542344228214, "learning_rate": 6.19439251157364e-05, "loss": 0.2653, "step": 1509 }, { "epoch": 1.986842105263158, "grad_norm": 0.07755918045728276, "learning_rate": 6.180239051157681e-05, "loss": 0.2558, "step": 1510 }, { "epoch": 1.9881578947368421, "grad_norm": 0.07645794314850728, "learning_rate": 6.166094544000398e-05, "loss": 0.2559, "step": 1511 }, { "epoch": 1.9894736842105263, "grad_norm": 0.07640733998010549, "learning_rate": 6.151959023255545e-05, "loss": 0.2537, "step": 1512 }, { "epoch": 1.9907894736842104, "grad_norm": 0.0768529160978783, "learning_rate": 6.137832522055817e-05, "loss": 0.2601, "step": 1513 }, { "epoch": 1.9921052631578946, "grad_norm": 0.07847739643759478, "learning_rate": 6.123715073512772e-05, "loss": 0.255, "step": 1514 }, { "epoch": 1.993421052631579, "grad_norm": 0.07449855656212337, "learning_rate": 6.109606710716741e-05, "loss": 0.239, "step": 1515 }, { "epoch": 1.9947368421052631, "grad_norm": 0.07846660941255253, "learning_rate": 6.095507466736763e-05, "loss": 0.2551, "step": 1516 }, { "epoch": 1.9960526315789475, "grad_norm": 0.07789468698938513, "learning_rate": 6.0814173746205e-05, "loss": 0.2694, "step": 1517 }, { "epoch": 1.9973684210526317, "grad_norm": 0.077200397264026, "learning_rate": 6.067336467394169e-05, "loss": 0.2589, "step": 1518 }, { "epoch": 1.9986842105263158, "grad_norm": 0.08246944198719414, "learning_rate": 6.0532647780624554e-05, "loss": 0.2656, "step": 1519 }, { "epoch": 2.0, "grad_norm": 0.07831394886097501, "learning_rate": 6.039202339608432e-05, "loss": 0.2656, "step": 1520 }, { "epoch": 2.0, "eval_loss": 0.2646636962890625, "eval_runtime": 136.4654, "eval_samples_per_second": 37.504, "eval_steps_per_second": 1.172, "step": 1520 }, { "epoch": 2.001315789473684, "grad_norm": 0.07478380723960856, "learning_rate": 6.025149184993498e-05, "loss": 0.2524, "step": 1521 }, { "epoch": 2.0026315789473683, "grad_norm": 0.07372772894034235, "learning_rate": 6.011105347157289e-05, "loss": 0.2399, "step": 1522 }, { "epoch": 2.0039473684210525, "grad_norm": 0.07430175019575523, "learning_rate": 5.9970708590175986e-05, "loss": 0.2369, "step": 1523 }, { "epoch": 2.0052631578947366, "grad_norm": 0.07626311971922609, "learning_rate": 5.983045753470308e-05, "loss": 0.2371, "step": 1524 }, { "epoch": 2.0065789473684212, "grad_norm": 0.07792913324419348, "learning_rate": 5.969030063389305e-05, "loss": 0.2372, "step": 1525 }, { "epoch": 2.0078947368421054, "grad_norm": 0.07819835291288194, "learning_rate": 5.955023821626411e-05, "loss": 0.2299, "step": 1526 }, { "epoch": 2.0092105263157896, "grad_norm": 0.0782044620700486, "learning_rate": 5.941027061011303e-05, "loss": 0.243, "step": 1527 }, { "epoch": 2.0105263157894737, "grad_norm": 0.08211154278042845, "learning_rate": 5.927039814351426e-05, "loss": 0.2432, "step": 1528 }, { "epoch": 2.011842105263158, "grad_norm": 0.08053029149065172, "learning_rate": 5.9130621144319334e-05, "loss": 0.2349, "step": 1529 }, { "epoch": 2.013157894736842, "grad_norm": 0.08170774492319388, "learning_rate": 5.8990939940156e-05, "loss": 0.2455, "step": 1530 }, { "epoch": 2.014473684210526, "grad_norm": 0.07949305041445266, "learning_rate": 5.885135485842743e-05, "loss": 0.2351, "step": 1531 }, { "epoch": 2.0157894736842104, "grad_norm": 0.08110005829531224, "learning_rate": 5.8711866226311553e-05, "loss": 0.2579, "step": 1532 }, { "epoch": 2.017105263157895, "grad_norm": 0.08333787912458002, "learning_rate": 5.857247437076012e-05, "loss": 0.2387, "step": 1533 }, { "epoch": 2.018421052631579, "grad_norm": 0.08010741675929599, "learning_rate": 5.843317961849818e-05, "loss": 0.2299, "step": 1534 }, { "epoch": 2.0197368421052633, "grad_norm": 0.08205136086453177, "learning_rate": 5.829398229602312e-05, "loss": 0.2422, "step": 1535 }, { "epoch": 2.0210526315789474, "grad_norm": 0.08134779329404473, "learning_rate": 5.8154882729603876e-05, "loss": 0.2488, "step": 1536 }, { "epoch": 2.0223684210526316, "grad_norm": 0.0793049391780135, "learning_rate": 5.8015881245280436e-05, "loss": 0.2404, "step": 1537 }, { "epoch": 2.0236842105263158, "grad_norm": 0.07994738843779876, "learning_rate": 5.787697816886273e-05, "loss": 0.2309, "step": 1538 }, { "epoch": 2.025, "grad_norm": 0.07995832117150292, "learning_rate": 5.773817382593008e-05, "loss": 0.2243, "step": 1539 }, { "epoch": 2.026315789473684, "grad_norm": 0.0805658044178454, "learning_rate": 5.7599468541830356e-05, "loss": 0.236, "step": 1540 }, { "epoch": 2.0276315789473682, "grad_norm": 0.082233376699682, "learning_rate": 5.7460862641679316e-05, "loss": 0.2395, "step": 1541 }, { "epoch": 2.028947368421053, "grad_norm": 0.08455600799602911, "learning_rate": 5.732235645035964e-05, "loss": 0.2527, "step": 1542 }, { "epoch": 2.030263157894737, "grad_norm": 0.08166867758119097, "learning_rate": 5.7183950292520473e-05, "loss": 0.2247, "step": 1543 }, { "epoch": 2.031578947368421, "grad_norm": 0.08362795390398718, "learning_rate": 5.7045644492576346e-05, "loss": 0.2334, "step": 1544 }, { "epoch": 2.0328947368421053, "grad_norm": 0.08759943221985574, "learning_rate": 5.690743937470657e-05, "loss": 0.25, "step": 1545 }, { "epoch": 2.0342105263157895, "grad_norm": 0.0834958876411245, "learning_rate": 5.676933526285457e-05, "loss": 0.2416, "step": 1546 }, { "epoch": 2.0355263157894736, "grad_norm": 0.08052792625700166, "learning_rate": 5.663133248072692e-05, "loss": 0.2363, "step": 1547 }, { "epoch": 2.036842105263158, "grad_norm": 0.08077196722556565, "learning_rate": 5.64934313517927e-05, "loss": 0.2395, "step": 1548 }, { "epoch": 2.038157894736842, "grad_norm": 0.08231061675764222, "learning_rate": 5.635563219928275e-05, "loss": 0.2467, "step": 1549 }, { "epoch": 2.039473684210526, "grad_norm": 0.08140686496008916, "learning_rate": 5.62179353461888e-05, "loss": 0.2399, "step": 1550 }, { "epoch": 2.0407894736842107, "grad_norm": 0.08075751241578549, "learning_rate": 5.608034111526298e-05, "loss": 0.2262, "step": 1551 }, { "epoch": 2.042105263157895, "grad_norm": 0.0814531147960288, "learning_rate": 5.5942849829016695e-05, "loss": 0.2292, "step": 1552 }, { "epoch": 2.043421052631579, "grad_norm": 0.0807479054094009, "learning_rate": 5.580546180972011e-05, "loss": 0.243, "step": 1553 }, { "epoch": 2.044736842105263, "grad_norm": 0.08012998797966009, "learning_rate": 5.566817737940142e-05, "loss": 0.232, "step": 1554 }, { "epoch": 2.0460526315789473, "grad_norm": 0.08437656676259887, "learning_rate": 5.553099685984591e-05, "loss": 0.2499, "step": 1555 }, { "epoch": 2.0473684210526315, "grad_norm": 0.08389268402272403, "learning_rate": 5.5393920572595356e-05, "loss": 0.2318, "step": 1556 }, { "epoch": 2.0486842105263157, "grad_norm": 0.08099986813273759, "learning_rate": 5.52569488389472e-05, "loss": 0.2408, "step": 1557 }, { "epoch": 2.05, "grad_norm": 0.08017638799598005, "learning_rate": 5.5120081979953785e-05, "loss": 0.2276, "step": 1558 }, { "epoch": 2.0513157894736844, "grad_norm": 0.08313509302134009, "learning_rate": 5.498332031642177e-05, "loss": 0.2464, "step": 1559 }, { "epoch": 2.0526315789473686, "grad_norm": 0.08458917823416191, "learning_rate": 5.484666416891109e-05, "loss": 0.2402, "step": 1560 }, { "epoch": 2.0539473684210527, "grad_norm": 0.08235673729736151, "learning_rate": 5.4710113857734394e-05, "loss": 0.2316, "step": 1561 }, { "epoch": 2.055263157894737, "grad_norm": 0.08137519978808737, "learning_rate": 5.457366970295634e-05, "loss": 0.2354, "step": 1562 }, { "epoch": 2.056578947368421, "grad_norm": 0.08503575588624475, "learning_rate": 5.4437332024392694e-05, "loss": 0.2356, "step": 1563 }, { "epoch": 2.057894736842105, "grad_norm": 0.08267486713807648, "learning_rate": 5.430110114160964e-05, "loss": 0.2492, "step": 1564 }, { "epoch": 2.0592105263157894, "grad_norm": 0.08202736940009933, "learning_rate": 5.416497737392308e-05, "loss": 0.2418, "step": 1565 }, { "epoch": 2.0605263157894735, "grad_norm": 0.0848040424518247, "learning_rate": 5.402896104039776e-05, "loss": 0.2385, "step": 1566 }, { "epoch": 2.0618421052631577, "grad_norm": 0.08202439572712068, "learning_rate": 5.389305245984675e-05, "loss": 0.2411, "step": 1567 }, { "epoch": 2.0631578947368423, "grad_norm": 0.08444110111591935, "learning_rate": 5.375725195083046e-05, "loss": 0.2365, "step": 1568 }, { "epoch": 2.0644736842105265, "grad_norm": 0.08419157425499689, "learning_rate": 5.362155983165594e-05, "loss": 0.2472, "step": 1569 }, { "epoch": 2.0657894736842106, "grad_norm": 0.08359202979261496, "learning_rate": 5.3485976420376336e-05, "loss": 0.2495, "step": 1570 }, { "epoch": 2.067105263157895, "grad_norm": 0.08127010161214182, "learning_rate": 5.335050203478988e-05, "loss": 0.2319, "step": 1571 }, { "epoch": 2.068421052631579, "grad_norm": 0.08168717674226678, "learning_rate": 5.321513699243924e-05, "loss": 0.2339, "step": 1572 }, { "epoch": 2.069736842105263, "grad_norm": 0.08449279952945175, "learning_rate": 5.307988161061085e-05, "loss": 0.238, "step": 1573 }, { "epoch": 2.0710526315789473, "grad_norm": 0.08225377668981279, "learning_rate": 5.2944736206334034e-05, "loss": 0.2409, "step": 1574 }, { "epoch": 2.0723684210526314, "grad_norm": 0.0843365596375047, "learning_rate": 5.280970109638047e-05, "loss": 0.246, "step": 1575 }, { "epoch": 2.0736842105263156, "grad_norm": 0.08534819131960646, "learning_rate": 5.2674776597263186e-05, "loss": 0.2487, "step": 1576 }, { "epoch": 2.075, "grad_norm": 0.08083479176795237, "learning_rate": 5.253996302523596e-05, "loss": 0.2432, "step": 1577 }, { "epoch": 2.0763157894736843, "grad_norm": 0.0813017134645149, "learning_rate": 5.240526069629265e-05, "loss": 0.2426, "step": 1578 }, { "epoch": 2.0776315789473685, "grad_norm": 0.08318443053477804, "learning_rate": 5.227066992616629e-05, "loss": 0.2451, "step": 1579 }, { "epoch": 2.0789473684210527, "grad_norm": 0.0833772227653302, "learning_rate": 5.2136191030328455e-05, "loss": 0.2351, "step": 1580 }, { "epoch": 2.080263157894737, "grad_norm": 0.08342495654640115, "learning_rate": 5.2001824323988455e-05, "loss": 0.232, "step": 1581 }, { "epoch": 2.081578947368421, "grad_norm": 0.08272035294268948, "learning_rate": 5.1867570122092666e-05, "loss": 0.2383, "step": 1582 }, { "epoch": 2.082894736842105, "grad_norm": 0.08074034588106882, "learning_rate": 5.173342873932383e-05, "loss": 0.232, "step": 1583 }, { "epoch": 2.0842105263157893, "grad_norm": 0.08255542214386506, "learning_rate": 5.159940049010015e-05, "loss": 0.2346, "step": 1584 }, { "epoch": 2.085526315789474, "grad_norm": 0.08454238319499686, "learning_rate": 5.146548568857462e-05, "loss": 0.2421, "step": 1585 }, { "epoch": 2.086842105263158, "grad_norm": 0.084726345772104, "learning_rate": 5.133168464863449e-05, "loss": 0.2387, "step": 1586 }, { "epoch": 2.088157894736842, "grad_norm": 0.08212224275163124, "learning_rate": 5.1197997683900214e-05, "loss": 0.2352, "step": 1587 }, { "epoch": 2.0894736842105264, "grad_norm": 0.08285955543488045, "learning_rate": 5.106442510772489e-05, "loss": 0.2246, "step": 1588 }, { "epoch": 2.0907894736842105, "grad_norm": 0.08221363824415492, "learning_rate": 5.0930967233193504e-05, "loss": 0.2313, "step": 1589 }, { "epoch": 2.0921052631578947, "grad_norm": 0.08240239248129916, "learning_rate": 5.079762437312219e-05, "loss": 0.2374, "step": 1590 }, { "epoch": 2.093421052631579, "grad_norm": 0.0833112285945494, "learning_rate": 5.066439684005755e-05, "loss": 0.2354, "step": 1591 }, { "epoch": 2.094736842105263, "grad_norm": 0.08005461008140058, "learning_rate": 5.0531284946275784e-05, "loss": 0.215, "step": 1592 }, { "epoch": 2.096052631578947, "grad_norm": 0.08221155288324693, "learning_rate": 5.039828900378204e-05, "loss": 0.2298, "step": 1593 }, { "epoch": 2.0973684210526318, "grad_norm": 0.08430361712412235, "learning_rate": 5.02654093243098e-05, "loss": 0.2391, "step": 1594 }, { "epoch": 2.098684210526316, "grad_norm": 0.08646135396687035, "learning_rate": 5.013264621931991e-05, "loss": 0.2382, "step": 1595 }, { "epoch": 2.1, "grad_norm": 0.08349700203559438, "learning_rate": 5.000000000000002e-05, "loss": 0.2353, "step": 1596 }, { "epoch": 2.1013157894736842, "grad_norm": 0.08425984836395248, "learning_rate": 4.986747097726381e-05, "loss": 0.2298, "step": 1597 }, { "epoch": 2.1026315789473684, "grad_norm": 0.08389516910120101, "learning_rate": 4.97350594617502e-05, "loss": 0.237, "step": 1598 }, { "epoch": 2.1039473684210526, "grad_norm": 0.08297711753798366, "learning_rate": 4.960276576382283e-05, "loss": 0.2388, "step": 1599 }, { "epoch": 2.1052631578947367, "grad_norm": 0.08353450960448826, "learning_rate": 4.9470590193569044e-05, "loss": 0.2448, "step": 1600 }, { "epoch": 2.106578947368421, "grad_norm": 0.08489883369548581, "learning_rate": 4.9338533060799306e-05, "loss": 0.2468, "step": 1601 }, { "epoch": 2.1078947368421055, "grad_norm": 0.08015122761732316, "learning_rate": 4.920659467504659e-05, "loss": 0.2311, "step": 1602 }, { "epoch": 2.1092105263157896, "grad_norm": 0.0825490688742884, "learning_rate": 4.907477534556542e-05, "loss": 0.237, "step": 1603 }, { "epoch": 2.110526315789474, "grad_norm": 0.08391033769643125, "learning_rate": 4.894307538133129e-05, "loss": 0.2384, "step": 1604 }, { "epoch": 2.111842105263158, "grad_norm": 0.08420377491425757, "learning_rate": 4.8811495091039926e-05, "loss": 0.2365, "step": 1605 }, { "epoch": 2.113157894736842, "grad_norm": 0.08279227752642704, "learning_rate": 4.868003478310651e-05, "loss": 0.2335, "step": 1606 }, { "epoch": 2.1144736842105263, "grad_norm": 0.08322872043307299, "learning_rate": 4.854869476566508e-05, "loss": 0.2309, "step": 1607 }, { "epoch": 2.1157894736842104, "grad_norm": 0.08569452755597119, "learning_rate": 4.841747534656763e-05, "loss": 0.2365, "step": 1608 }, { "epoch": 2.1171052631578946, "grad_norm": 0.08195650643162714, "learning_rate": 4.828637683338347e-05, "loss": 0.2304, "step": 1609 }, { "epoch": 2.1184210526315788, "grad_norm": 0.08713567384955483, "learning_rate": 4.815539953339865e-05, "loss": 0.2411, "step": 1610 }, { "epoch": 2.1197368421052634, "grad_norm": 0.08545867868134706, "learning_rate": 4.802454375361495e-05, "loss": 0.2418, "step": 1611 }, { "epoch": 2.1210526315789475, "grad_norm": 0.08523598311058483, "learning_rate": 4.7893809800749403e-05, "loss": 0.2328, "step": 1612 }, { "epoch": 2.1223684210526317, "grad_norm": 0.08672971700799055, "learning_rate": 4.776319798123344e-05, "loss": 0.2357, "step": 1613 }, { "epoch": 2.123684210526316, "grad_norm": 0.08589391450643363, "learning_rate": 4.763270860121222e-05, "loss": 0.2376, "step": 1614 }, { "epoch": 2.125, "grad_norm": 0.08657771324163321, "learning_rate": 4.7502341966544e-05, "loss": 0.2409, "step": 1615 }, { "epoch": 2.126315789473684, "grad_norm": 0.08553999858199614, "learning_rate": 4.737209838279922e-05, "loss": 0.2498, "step": 1616 }, { "epoch": 2.1276315789473683, "grad_norm": 0.08346189155987148, "learning_rate": 4.7241978155259925e-05, "loss": 0.2382, "step": 1617 }, { "epoch": 2.1289473684210525, "grad_norm": 0.0832168506244348, "learning_rate": 4.7111981588919084e-05, "loss": 0.2362, "step": 1618 }, { "epoch": 2.1302631578947366, "grad_norm": 0.08400154849121748, "learning_rate": 4.698210898847976e-05, "loss": 0.2397, "step": 1619 }, { "epoch": 2.1315789473684212, "grad_norm": 0.08285572712672996, "learning_rate": 4.685236065835443e-05, "loss": 0.2385, "step": 1620 }, { "epoch": 2.1328947368421054, "grad_norm": 0.08317759957463747, "learning_rate": 4.6722736902664334e-05, "loss": 0.2406, "step": 1621 }, { "epoch": 2.1342105263157896, "grad_norm": 0.08245858802078364, "learning_rate": 4.659323802523864e-05, "loss": 0.2391, "step": 1622 }, { "epoch": 2.1355263157894737, "grad_norm": 0.08197930733916409, "learning_rate": 4.646386432961396e-05, "loss": 0.2319, "step": 1623 }, { "epoch": 2.136842105263158, "grad_norm": 0.08493589285679196, "learning_rate": 4.6334616119033356e-05, "loss": 0.243, "step": 1624 }, { "epoch": 2.138157894736842, "grad_norm": 0.08490735296967258, "learning_rate": 4.6205493696445754e-05, "loss": 0.236, "step": 1625 }, { "epoch": 2.139473684210526, "grad_norm": 0.08540155245746917, "learning_rate": 4.6076497364505386e-05, "loss": 0.2517, "step": 1626 }, { "epoch": 2.1407894736842104, "grad_norm": 0.08620766196769589, "learning_rate": 4.594762742557078e-05, "loss": 0.2317, "step": 1627 }, { "epoch": 2.1421052631578945, "grad_norm": 0.08354790535329532, "learning_rate": 4.5818884181704294e-05, "loss": 0.2265, "step": 1628 }, { "epoch": 2.143421052631579, "grad_norm": 0.08468876926659806, "learning_rate": 4.569026793467126e-05, "loss": 0.2323, "step": 1629 }, { "epoch": 2.1447368421052633, "grad_norm": 0.08422922304528441, "learning_rate": 4.5561778985939366e-05, "loss": 0.231, "step": 1630 }, { "epoch": 2.1460526315789474, "grad_norm": 0.08597404500241558, "learning_rate": 4.543341763667799e-05, "loss": 0.2455, "step": 1631 }, { "epoch": 2.1473684210526316, "grad_norm": 0.08569530326388924, "learning_rate": 4.530518418775733e-05, "loss": 0.2374, "step": 1632 }, { "epoch": 2.1486842105263158, "grad_norm": 0.08651611173999371, "learning_rate": 4.5177078939747796e-05, "loss": 0.2491, "step": 1633 }, { "epoch": 2.15, "grad_norm": 0.08435793058285544, "learning_rate": 4.50491021929194e-05, "loss": 0.2414, "step": 1634 }, { "epoch": 2.151315789473684, "grad_norm": 0.08452273206770455, "learning_rate": 4.492125424724086e-05, "loss": 0.2389, "step": 1635 }, { "epoch": 2.1526315789473682, "grad_norm": 0.08400832223995384, "learning_rate": 4.479353540237903e-05, "loss": 0.2509, "step": 1636 }, { "epoch": 2.153947368421053, "grad_norm": 0.0864191829115749, "learning_rate": 4.466594595769814e-05, "loss": 0.2546, "step": 1637 }, { "epoch": 2.155263157894737, "grad_norm": 0.08469120634147317, "learning_rate": 4.453848621225912e-05, "loss": 0.2357, "step": 1638 }, { "epoch": 2.156578947368421, "grad_norm": 0.08564901017872756, "learning_rate": 4.441115646481896e-05, "loss": 0.2367, "step": 1639 }, { "epoch": 2.1578947368421053, "grad_norm": 0.08581568319788428, "learning_rate": 4.4283957013829846e-05, "loss": 0.238, "step": 1640 }, { "epoch": 2.1592105263157895, "grad_norm": 0.08280728226775676, "learning_rate": 4.415688815743858e-05, "loss": 0.2296, "step": 1641 }, { "epoch": 2.1605263157894736, "grad_norm": 0.0853460546951434, "learning_rate": 4.402995019348595e-05, "loss": 0.2387, "step": 1642 }, { "epoch": 2.161842105263158, "grad_norm": 0.0866721985482825, "learning_rate": 4.390314341950581e-05, "loss": 0.2425, "step": 1643 }, { "epoch": 2.163157894736842, "grad_norm": 0.08725371319117803, "learning_rate": 4.3776468132724604e-05, "loss": 0.242, "step": 1644 }, { "epoch": 2.1644736842105265, "grad_norm": 0.08513410768610372, "learning_rate": 4.3649924630060534e-05, "loss": 0.2458, "step": 1645 }, { "epoch": 2.1657894736842107, "grad_norm": 0.08350902892663929, "learning_rate": 4.35235132081229e-05, "loss": 0.2373, "step": 1646 }, { "epoch": 2.167105263157895, "grad_norm": 0.08213110851604064, "learning_rate": 4.3397234163211483e-05, "loss": 0.2244, "step": 1647 }, { "epoch": 2.168421052631579, "grad_norm": 0.08674873213326942, "learning_rate": 4.3271087791315734e-05, "loss": 0.2425, "step": 1648 }, { "epoch": 2.169736842105263, "grad_norm": 0.08588867781512083, "learning_rate": 4.314507438811407e-05, "loss": 0.2368, "step": 1649 }, { "epoch": 2.1710526315789473, "grad_norm": 0.08387668032160955, "learning_rate": 4.301919424897338e-05, "loss": 0.2397, "step": 1650 }, { "epoch": 2.1723684210526315, "grad_norm": 0.08260437607051827, "learning_rate": 4.289344766894807e-05, "loss": 0.2323, "step": 1651 }, { "epoch": 2.1736842105263157, "grad_norm": 0.0833246040235028, "learning_rate": 4.276783494277954e-05, "loss": 0.2313, "step": 1652 }, { "epoch": 2.175, "grad_norm": 0.08291199475859631, "learning_rate": 4.264235636489542e-05, "loss": 0.2342, "step": 1653 }, { "epoch": 2.1763157894736844, "grad_norm": 0.08695364030640872, "learning_rate": 4.2517012229408905e-05, "loss": 0.2534, "step": 1654 }, { "epoch": 2.1776315789473686, "grad_norm": 0.08428816275057385, "learning_rate": 4.2391802830118135e-05, "loss": 0.2346, "step": 1655 }, { "epoch": 2.1789473684210527, "grad_norm": 0.084207037568309, "learning_rate": 4.2266728460505375e-05, "loss": 0.2381, "step": 1656 }, { "epoch": 2.180263157894737, "grad_norm": 0.0886796758669413, "learning_rate": 4.2141789413736354e-05, "loss": 0.2495, "step": 1657 }, { "epoch": 2.181578947368421, "grad_norm": 0.08307977015154046, "learning_rate": 4.201698598265973e-05, "loss": 0.2327, "step": 1658 }, { "epoch": 2.182894736842105, "grad_norm": 0.08244618117247395, "learning_rate": 4.189231845980618e-05, "loss": 0.2344, "step": 1659 }, { "epoch": 2.1842105263157894, "grad_norm": 0.08471751852248358, "learning_rate": 4.176778713738787e-05, "loss": 0.2341, "step": 1660 }, { "epoch": 2.1855263157894735, "grad_norm": 0.08538574209457075, "learning_rate": 4.164339230729771e-05, "loss": 0.2375, "step": 1661 }, { "epoch": 2.1868421052631577, "grad_norm": 0.08739644816596194, "learning_rate": 4.151913426110864e-05, "loss": 0.2357, "step": 1662 }, { "epoch": 2.1881578947368423, "grad_norm": 0.08456784004962821, "learning_rate": 4.13950132900731e-05, "loss": 0.2389, "step": 1663 }, { "epoch": 2.1894736842105265, "grad_norm": 0.08259644860196363, "learning_rate": 4.127102968512214e-05, "loss": 0.2335, "step": 1664 }, { "epoch": 2.1907894736842106, "grad_norm": 0.08038218499706021, "learning_rate": 4.114718373686481e-05, "loss": 0.233, "step": 1665 }, { "epoch": 2.192105263157895, "grad_norm": 0.08297061732865985, "learning_rate": 4.102347573558763e-05, "loss": 0.226, "step": 1666 }, { "epoch": 2.193421052631579, "grad_norm": 0.08863091885047723, "learning_rate": 4.089990597125368e-05, "loss": 0.2438, "step": 1667 }, { "epoch": 2.194736842105263, "grad_norm": 0.08628873633777266, "learning_rate": 4.077647473350201e-05, "loss": 0.2361, "step": 1668 }, { "epoch": 2.1960526315789473, "grad_norm": 0.08360640948058073, "learning_rate": 4.065318231164704e-05, "loss": 0.2417, "step": 1669 }, { "epoch": 2.1973684210526314, "grad_norm": 0.08524943894134786, "learning_rate": 4.053002899467774e-05, "loss": 0.2387, "step": 1670 }, { "epoch": 2.1986842105263156, "grad_norm": 0.08537007936359343, "learning_rate": 4.040701507125712e-05, "loss": 0.2408, "step": 1671 }, { "epoch": 2.2, "grad_norm": 0.08300365174925911, "learning_rate": 4.028414082972141e-05, "loss": 0.226, "step": 1672 }, { "epoch": 2.2013157894736843, "grad_norm": 0.08790060045013368, "learning_rate": 4.016140655807936e-05, "loss": 0.2507, "step": 1673 }, { "epoch": 2.2026315789473685, "grad_norm": 0.0831197113798778, "learning_rate": 4.003881254401183e-05, "loss": 0.2343, "step": 1674 }, { "epoch": 2.2039473684210527, "grad_norm": 0.08118714142897289, "learning_rate": 3.991635907487076e-05, "loss": 0.2265, "step": 1675 }, { "epoch": 2.205263157894737, "grad_norm": 0.08836758200078478, "learning_rate": 3.97940464376787e-05, "loss": 0.2539, "step": 1676 }, { "epoch": 2.206578947368421, "grad_norm": 0.0829126388696676, "learning_rate": 3.967187491912813e-05, "loss": 0.2343, "step": 1677 }, { "epoch": 2.207894736842105, "grad_norm": 0.08050172289656292, "learning_rate": 3.9549844805580706e-05, "loss": 0.2339, "step": 1678 }, { "epoch": 2.2092105263157893, "grad_norm": 0.08394767599321681, "learning_rate": 3.942795638306674e-05, "loss": 0.2369, "step": 1679 }, { "epoch": 2.2105263157894735, "grad_norm": 0.08711738632474018, "learning_rate": 3.9306209937284346e-05, "loss": 0.2419, "step": 1680 }, { "epoch": 2.211842105263158, "grad_norm": 0.0814409575586451, "learning_rate": 3.918460575359882e-05, "loss": 0.2306, "step": 1681 }, { "epoch": 2.213157894736842, "grad_norm": 0.08526205771283932, "learning_rate": 3.906314411704215e-05, "loss": 0.2374, "step": 1682 }, { "epoch": 2.2144736842105264, "grad_norm": 0.08590081960012944, "learning_rate": 3.8941825312312054e-05, "loss": 0.2324, "step": 1683 }, { "epoch": 2.2157894736842105, "grad_norm": 0.08577840811573154, "learning_rate": 3.882064962377154e-05, "loss": 0.2402, "step": 1684 }, { "epoch": 2.2171052631578947, "grad_norm": 0.08559702425445263, "learning_rate": 3.869961733544814e-05, "loss": 0.2402, "step": 1685 }, { "epoch": 2.218421052631579, "grad_norm": 0.08436964025823426, "learning_rate": 3.857872873103322e-05, "loss": 0.2415, "step": 1686 }, { "epoch": 2.219736842105263, "grad_norm": 0.08396240106431019, "learning_rate": 3.845798409388149e-05, "loss": 0.2302, "step": 1687 }, { "epoch": 2.221052631578947, "grad_norm": 0.08629968011623863, "learning_rate": 3.83373837070101e-05, "loss": 0.2358, "step": 1688 }, { "epoch": 2.2223684210526318, "grad_norm": 0.0838099558892786, "learning_rate": 3.821692785309807e-05, "loss": 0.2334, "step": 1689 }, { "epoch": 2.223684210526316, "grad_norm": 0.08557035987280985, "learning_rate": 3.809661681448576e-05, "loss": 0.2311, "step": 1690 }, { "epoch": 2.225, "grad_norm": 0.08591483983457045, "learning_rate": 3.7976450873174005e-05, "loss": 0.2421, "step": 1691 }, { "epoch": 2.2263157894736842, "grad_norm": 0.08701077906025538, "learning_rate": 3.7856430310823545e-05, "loss": 0.2451, "step": 1692 }, { "epoch": 2.2276315789473684, "grad_norm": 0.08152777587376067, "learning_rate": 3.773655540875438e-05, "loss": 0.2335, "step": 1693 }, { "epoch": 2.2289473684210526, "grad_norm": 0.0879771188843296, "learning_rate": 3.7616826447945066e-05, "loss": 0.2404, "step": 1694 }, { "epoch": 2.2302631578947367, "grad_norm": 0.08415415636312946, "learning_rate": 3.749724370903216e-05, "loss": 0.2355, "step": 1695 }, { "epoch": 2.231578947368421, "grad_norm": 0.08756860364801405, "learning_rate": 3.737780747230941e-05, "loss": 0.2378, "step": 1696 }, { "epoch": 2.2328947368421055, "grad_norm": 0.08454219628982126, "learning_rate": 3.725851801772715e-05, "loss": 0.2445, "step": 1697 }, { "epoch": 2.2342105263157896, "grad_norm": 0.0869901636562866, "learning_rate": 3.713937562489179e-05, "loss": 0.2296, "step": 1698 }, { "epoch": 2.235526315789474, "grad_norm": 0.08630652964552962, "learning_rate": 3.702038057306492e-05, "loss": 0.2302, "step": 1699 }, { "epoch": 2.236842105263158, "grad_norm": 0.08659038929154979, "learning_rate": 3.69015331411628e-05, "loss": 0.2351, "step": 1700 }, { "epoch": 2.238157894736842, "grad_norm": 0.08649330336962248, "learning_rate": 3.678283360775571e-05, "loss": 0.2362, "step": 1701 }, { "epoch": 2.2394736842105263, "grad_norm": 0.08424707415501553, "learning_rate": 3.6664282251067184e-05, "loss": 0.2378, "step": 1702 }, { "epoch": 2.2407894736842104, "grad_norm": 0.08338435013878512, "learning_rate": 3.65458793489736e-05, "loss": 0.2336, "step": 1703 }, { "epoch": 2.2421052631578946, "grad_norm": 0.08719248023927356, "learning_rate": 3.642762517900322e-05, "loss": 0.2426, "step": 1704 }, { "epoch": 2.2434210526315788, "grad_norm": 0.08436003020889046, "learning_rate": 3.6309520018335705e-05, "loss": 0.2301, "step": 1705 }, { "epoch": 2.2447368421052634, "grad_norm": 0.08728623567451708, "learning_rate": 3.619156414380156e-05, "loss": 0.2462, "step": 1706 }, { "epoch": 2.2460526315789475, "grad_norm": 0.08273816077006978, "learning_rate": 3.607375783188125e-05, "loss": 0.2433, "step": 1707 }, { "epoch": 2.2473684210526317, "grad_norm": 0.08380037356624727, "learning_rate": 3.595610135870472e-05, "loss": 0.2419, "step": 1708 }, { "epoch": 2.248684210526316, "grad_norm": 0.08599999255981298, "learning_rate": 3.583859500005071e-05, "loss": 0.2459, "step": 1709 }, { "epoch": 2.25, "grad_norm": 0.0857928049153299, "learning_rate": 3.5721239031346066e-05, "loss": 0.2332, "step": 1710 }, { "epoch": 2.251315789473684, "grad_norm": 0.08563816332335225, "learning_rate": 3.560403372766522e-05, "loss": 0.239, "step": 1711 }, { "epoch": 2.2526315789473683, "grad_norm": 0.08648184240333989, "learning_rate": 3.548697936372937e-05, "loss": 0.2369, "step": 1712 }, { "epoch": 2.2539473684210525, "grad_norm": 0.08405093979550926, "learning_rate": 3.53700762139059e-05, "loss": 0.2484, "step": 1713 }, { "epoch": 2.2552631578947366, "grad_norm": 0.08629123371639234, "learning_rate": 3.525332455220789e-05, "loss": 0.2526, "step": 1714 }, { "epoch": 2.2565789473684212, "grad_norm": 0.08603955512267374, "learning_rate": 3.5136724652293206e-05, "loss": 0.2363, "step": 1715 }, { "epoch": 2.2578947368421054, "grad_norm": 0.08357218310355902, "learning_rate": 3.5020276787464056e-05, "loss": 0.2379, "step": 1716 }, { "epoch": 2.2592105263157896, "grad_norm": 0.08178687870308674, "learning_rate": 3.490398123066628e-05, "loss": 0.2342, "step": 1717 }, { "epoch": 2.2605263157894737, "grad_norm": 0.08618629426674392, "learning_rate": 3.4787838254488694e-05, "loss": 0.2436, "step": 1718 }, { "epoch": 2.261842105263158, "grad_norm": 0.08654678035853532, "learning_rate": 3.4671848131162544e-05, "loss": 0.2413, "step": 1719 }, { "epoch": 2.263157894736842, "grad_norm": 0.08154103451407675, "learning_rate": 3.455601113256073e-05, "loss": 0.2445, "step": 1720 }, { "epoch": 2.264473684210526, "grad_norm": 0.08283767549879928, "learning_rate": 3.444032753019723e-05, "loss": 0.235, "step": 1721 }, { "epoch": 2.2657894736842104, "grad_norm": 0.08402970526889068, "learning_rate": 3.4324797595226565e-05, "loss": 0.2337, "step": 1722 }, { "epoch": 2.2671052631578945, "grad_norm": 0.08237791548613392, "learning_rate": 3.420942159844298e-05, "loss": 0.2352, "step": 1723 }, { "epoch": 2.268421052631579, "grad_norm": 0.08344712694132657, "learning_rate": 3.4094199810279924e-05, "loss": 0.236, "step": 1724 }, { "epoch": 2.2697368421052633, "grad_norm": 0.08799849742749684, "learning_rate": 3.3979132500809405e-05, "loss": 0.2481, "step": 1725 }, { "epoch": 2.2710526315789474, "grad_norm": 0.08588090804811839, "learning_rate": 3.386421993974129e-05, "loss": 0.2364, "step": 1726 }, { "epoch": 2.2723684210526316, "grad_norm": 0.0838270871173491, "learning_rate": 3.3749462396422846e-05, "loss": 0.228, "step": 1727 }, { "epoch": 2.2736842105263158, "grad_norm": 0.08845792107580112, "learning_rate": 3.363486013983788e-05, "loss": 0.2437, "step": 1728 }, { "epoch": 2.275, "grad_norm": 0.08775832799442573, "learning_rate": 3.352041343860621e-05, "loss": 0.2433, "step": 1729 }, { "epoch": 2.276315789473684, "grad_norm": 0.08852510695897718, "learning_rate": 3.340612256098316e-05, "loss": 0.2528, "step": 1730 }, { "epoch": 2.2776315789473682, "grad_norm": 0.09072365256400772, "learning_rate": 3.329198777485869e-05, "loss": 0.235, "step": 1731 }, { "epoch": 2.2789473684210524, "grad_norm": 0.08478552332865018, "learning_rate": 3.317800934775696e-05, "loss": 0.2419, "step": 1732 }, { "epoch": 2.280263157894737, "grad_norm": 0.08510050487380844, "learning_rate": 3.30641875468356e-05, "loss": 0.2382, "step": 1733 }, { "epoch": 2.281578947368421, "grad_norm": 0.08450032284933855, "learning_rate": 3.2950522638885106e-05, "loss": 0.2411, "step": 1734 }, { "epoch": 2.2828947368421053, "grad_norm": 0.0847645250276902, "learning_rate": 3.283701489032832e-05, "loss": 0.2289, "step": 1735 }, { "epoch": 2.2842105263157895, "grad_norm": 0.08500704575747163, "learning_rate": 3.2723664567219626e-05, "loss": 0.2362, "step": 1736 }, { "epoch": 2.2855263157894736, "grad_norm": 0.08540028460798461, "learning_rate": 3.261047193524439e-05, "loss": 0.2441, "step": 1737 }, { "epoch": 2.286842105263158, "grad_norm": 0.08482142626182317, "learning_rate": 3.249743725971849e-05, "loss": 0.2339, "step": 1738 }, { "epoch": 2.288157894736842, "grad_norm": 0.08455145109428938, "learning_rate": 3.238456080558743e-05, "loss": 0.2364, "step": 1739 }, { "epoch": 2.2894736842105265, "grad_norm": 0.08608434293062465, "learning_rate": 3.227184283742591e-05, "loss": 0.2427, "step": 1740 }, { "epoch": 2.2907894736842107, "grad_norm": 0.08525626191346178, "learning_rate": 3.2159283619437155e-05, "loss": 0.2447, "step": 1741 }, { "epoch": 2.292105263157895, "grad_norm": 0.0867443524413285, "learning_rate": 3.2046883415452246e-05, "loss": 0.2297, "step": 1742 }, { "epoch": 2.293421052631579, "grad_norm": 0.08427294151144223, "learning_rate": 3.193464248892964e-05, "loss": 0.2466, "step": 1743 }, { "epoch": 2.294736842105263, "grad_norm": 0.0849042097519616, "learning_rate": 3.182256110295437e-05, "loss": 0.2402, "step": 1744 }, { "epoch": 2.2960526315789473, "grad_norm": 0.08773216559618434, "learning_rate": 3.171063952023753e-05, "loss": 0.238, "step": 1745 }, { "epoch": 2.2973684210526315, "grad_norm": 0.08514375299514274, "learning_rate": 3.159887800311569e-05, "loss": 0.2495, "step": 1746 }, { "epoch": 2.2986842105263157, "grad_norm": 0.08221174413038916, "learning_rate": 3.148727681355022e-05, "loss": 0.2327, "step": 1747 }, { "epoch": 2.3, "grad_norm": 0.08292212985659095, "learning_rate": 3.137583621312665e-05, "loss": 0.2265, "step": 1748 }, { "epoch": 2.3013157894736844, "grad_norm": 0.08536339352006025, "learning_rate": 3.126455646305416e-05, "loss": 0.2316, "step": 1749 }, { "epoch": 2.3026315789473686, "grad_norm": 0.083425418514592, "learning_rate": 3.115343782416483e-05, "loss": 0.2405, "step": 1750 }, { "epoch": 2.3039473684210527, "grad_norm": 0.08635976351926793, "learning_rate": 3.1042480556913224e-05, "loss": 0.234, "step": 1751 }, { "epoch": 2.305263157894737, "grad_norm": 0.08648353037146853, "learning_rate": 3.093168492137557e-05, "loss": 0.222, "step": 1752 }, { "epoch": 2.306578947368421, "grad_norm": 0.08421631985285294, "learning_rate": 3.082105117724923e-05, "loss": 0.2301, "step": 1753 }, { "epoch": 2.307894736842105, "grad_norm": 0.08776858485770223, "learning_rate": 3.071057958385221e-05, "loss": 0.236, "step": 1754 }, { "epoch": 2.3092105263157894, "grad_norm": 0.08692453013595573, "learning_rate": 3.0600270400122335e-05, "loss": 0.2379, "step": 1755 }, { "epoch": 2.3105263157894735, "grad_norm": 0.08408758438701626, "learning_rate": 3.0490123884616796e-05, "loss": 0.2397, "step": 1756 }, { "epoch": 2.3118421052631577, "grad_norm": 0.08561383977421695, "learning_rate": 3.0380140295511516e-05, "loss": 0.2362, "step": 1757 }, { "epoch": 2.3131578947368423, "grad_norm": 0.08523323066228368, "learning_rate": 3.0270319890600462e-05, "loss": 0.2373, "step": 1758 }, { "epoch": 2.3144736842105265, "grad_norm": 0.08384268173924093, "learning_rate": 3.0160662927295225e-05, "loss": 0.2221, "step": 1759 }, { "epoch": 2.3157894736842106, "grad_norm": 0.08377079729896948, "learning_rate": 3.0051169662624225e-05, "loss": 0.2305, "step": 1760 }, { "epoch": 2.317105263157895, "grad_norm": 0.08434931689091195, "learning_rate": 2.994184035323213e-05, "loss": 0.2264, "step": 1761 }, { "epoch": 2.318421052631579, "grad_norm": 0.08485228565126582, "learning_rate": 2.983267525537945e-05, "loss": 0.2407, "step": 1762 }, { "epoch": 2.319736842105263, "grad_norm": 0.08458549101284824, "learning_rate": 2.9723674624941688e-05, "loss": 0.2397, "step": 1763 }, { "epoch": 2.3210526315789473, "grad_norm": 0.08376501835808486, "learning_rate": 2.9614838717408867e-05, "loss": 0.2353, "step": 1764 }, { "epoch": 2.3223684210526314, "grad_norm": 0.08574531246445859, "learning_rate": 2.950616778788492e-05, "loss": 0.2465, "step": 1765 }, { "epoch": 2.3236842105263156, "grad_norm": 0.08775545996621473, "learning_rate": 2.9397662091087054e-05, "loss": 0.2464, "step": 1766 }, { "epoch": 2.325, "grad_norm": 0.0818496814028781, "learning_rate": 2.9289321881345254e-05, "loss": 0.2204, "step": 1767 }, { "epoch": 2.3263157894736843, "grad_norm": 0.08595948586857366, "learning_rate": 2.9181147412601562e-05, "loss": 0.2329, "step": 1768 }, { "epoch": 2.3276315789473685, "grad_norm": 0.08376848170186316, "learning_rate": 2.9073138938409495e-05, "loss": 0.2416, "step": 1769 }, { "epoch": 2.3289473684210527, "grad_norm": 0.08594662597858775, "learning_rate": 2.89652967119336e-05, "loss": 0.2439, "step": 1770 }, { "epoch": 2.330263157894737, "grad_norm": 0.08548685030438735, "learning_rate": 2.8857620985948652e-05, "loss": 0.2466, "step": 1771 }, { "epoch": 2.331578947368421, "grad_norm": 0.08440124314126071, "learning_rate": 2.8750112012839214e-05, "loss": 0.2318, "step": 1772 }, { "epoch": 2.332894736842105, "grad_norm": 0.08563767620958296, "learning_rate": 2.8642770044598966e-05, "loss": 0.2386, "step": 1773 }, { "epoch": 2.3342105263157893, "grad_norm": 0.08413261927528606, "learning_rate": 2.8535595332830102e-05, "loss": 0.2412, "step": 1774 }, { "epoch": 2.3355263157894735, "grad_norm": 0.08430887282829173, "learning_rate": 2.8428588128742894e-05, "loss": 0.2433, "step": 1775 }, { "epoch": 2.336842105263158, "grad_norm": 0.08421978093619426, "learning_rate": 2.8321748683154893e-05, "loss": 0.2266, "step": 1776 }, { "epoch": 2.338157894736842, "grad_norm": 0.08367626319071061, "learning_rate": 2.8215077246490417e-05, "loss": 0.232, "step": 1777 }, { "epoch": 2.3394736842105264, "grad_norm": 0.0852480773032482, "learning_rate": 2.810857406878009e-05, "loss": 0.2431, "step": 1778 }, { "epoch": 2.3407894736842105, "grad_norm": 0.08752279412974256, "learning_rate": 2.800223939966007e-05, "loss": 0.2392, "step": 1779 }, { "epoch": 2.3421052631578947, "grad_norm": 0.08525332318728705, "learning_rate": 2.789607348837153e-05, "loss": 0.2261, "step": 1780 }, { "epoch": 2.343421052631579, "grad_norm": 0.08501669465697845, "learning_rate": 2.7790076583760126e-05, "loss": 0.236, "step": 1781 }, { "epoch": 2.344736842105263, "grad_norm": 0.08586501169000936, "learning_rate": 2.7684248934275325e-05, "loss": 0.2369, "step": 1782 }, { "epoch": 2.3460526315789476, "grad_norm": 0.08330526797227158, "learning_rate": 2.757859078796997e-05, "loss": 0.2426, "step": 1783 }, { "epoch": 2.3473684210526318, "grad_norm": 0.08682682618319912, "learning_rate": 2.7473102392499518e-05, "loss": 0.2405, "step": 1784 }, { "epoch": 2.348684210526316, "grad_norm": 0.08681140965019736, "learning_rate": 2.73677839951215e-05, "loss": 0.2438, "step": 1785 }, { "epoch": 2.35, "grad_norm": 0.08557189396659094, "learning_rate": 2.7262635842695127e-05, "loss": 0.232, "step": 1786 }, { "epoch": 2.3513157894736842, "grad_norm": 0.08357509250185297, "learning_rate": 2.7157658181680457e-05, "loss": 0.2391, "step": 1787 }, { "epoch": 2.3526315789473684, "grad_norm": 0.0863037240334069, "learning_rate": 2.7052851258137935e-05, "loss": 0.2385, "step": 1788 }, { "epoch": 2.3539473684210526, "grad_norm": 0.08781831219541067, "learning_rate": 2.6948215317727844e-05, "loss": 0.2332, "step": 1789 }, { "epoch": 2.3552631578947367, "grad_norm": 0.08657991059589279, "learning_rate": 2.684375060570965e-05, "loss": 0.2386, "step": 1790 }, { "epoch": 2.356578947368421, "grad_norm": 0.08748510488863394, "learning_rate": 2.6739457366941543e-05, "loss": 0.2449, "step": 1791 }, { "epoch": 2.3578947368421055, "grad_norm": 0.08693207168372159, "learning_rate": 2.6635335845879737e-05, "loss": 0.2294, "step": 1792 }, { "epoch": 2.3592105263157896, "grad_norm": 0.0867096331580623, "learning_rate": 2.653138628657793e-05, "loss": 0.2375, "step": 1793 }, { "epoch": 2.360526315789474, "grad_norm": 0.09001793260204524, "learning_rate": 2.6427608932686843e-05, "loss": 0.2479, "step": 1794 }, { "epoch": 2.361842105263158, "grad_norm": 0.08716676015459905, "learning_rate": 2.6324004027453464e-05, "loss": 0.2371, "step": 1795 }, { "epoch": 2.363157894736842, "grad_norm": 0.08978495624040321, "learning_rate": 2.622057181372063e-05, "loss": 0.2468, "step": 1796 }, { "epoch": 2.3644736842105263, "grad_norm": 0.08299772256887776, "learning_rate": 2.6117312533926362e-05, "loss": 0.2336, "step": 1797 }, { "epoch": 2.3657894736842104, "grad_norm": 0.08607109549303381, "learning_rate": 2.601422643010335e-05, "loss": 0.2424, "step": 1798 }, { "epoch": 2.3671052631578946, "grad_norm": 0.08507498429800535, "learning_rate": 2.5911313743878418e-05, "loss": 0.2385, "step": 1799 }, { "epoch": 2.3684210526315788, "grad_norm": 0.08326170794330705, "learning_rate": 2.5808574716471856e-05, "loss": 0.2419, "step": 1800 }, { "epoch": 2.3697368421052634, "grad_norm": 0.08536319126880634, "learning_rate": 2.570600958869689e-05, "loss": 0.2337, "step": 1801 }, { "epoch": 2.3710526315789475, "grad_norm": 0.0832285855296963, "learning_rate": 2.5603618600959223e-05, "loss": 0.2262, "step": 1802 }, { "epoch": 2.3723684210526317, "grad_norm": 0.08748256088729627, "learning_rate": 2.55014019932563e-05, "loss": 0.238, "step": 1803 }, { "epoch": 2.373684210526316, "grad_norm": 0.0845403743408863, "learning_rate": 2.5399360005176886e-05, "loss": 0.2342, "step": 1804 }, { "epoch": 2.375, "grad_norm": 0.08306478491889778, "learning_rate": 2.529749287590042e-05, "loss": 0.236, "step": 1805 }, { "epoch": 2.376315789473684, "grad_norm": 0.08506256922386096, "learning_rate": 2.519580084419646e-05, "loss": 0.2364, "step": 1806 }, { "epoch": 2.3776315789473683, "grad_norm": 0.0821793026467122, "learning_rate": 2.509428414842424e-05, "loss": 0.2241, "step": 1807 }, { "epoch": 2.3789473684210525, "grad_norm": 0.08556195534896847, "learning_rate": 2.4992943026531935e-05, "loss": 0.2365, "step": 1808 }, { "epoch": 2.3802631578947366, "grad_norm": 0.08705793209497273, "learning_rate": 2.4891777716056176e-05, "loss": 0.2526, "step": 1809 }, { "epoch": 2.3815789473684212, "grad_norm": 0.08705293536857067, "learning_rate": 2.4790788454121584e-05, "loss": 0.2361, "step": 1810 }, { "epoch": 2.3828947368421054, "grad_norm": 0.08488258217639373, "learning_rate": 2.4689975477440086e-05, "loss": 0.2273, "step": 1811 }, { "epoch": 2.3842105263157896, "grad_norm": 0.09044220228981603, "learning_rate": 2.4589339022310386e-05, "loss": 0.2532, "step": 1812 }, { "epoch": 2.3855263157894737, "grad_norm": 0.0858040156345174, "learning_rate": 2.4488879324617474e-05, "loss": 0.2395, "step": 1813 }, { "epoch": 2.386842105263158, "grad_norm": 0.08540339576774533, "learning_rate": 2.4388596619831993e-05, "loss": 0.2393, "step": 1814 }, { "epoch": 2.388157894736842, "grad_norm": 0.08684868336556462, "learning_rate": 2.4288491143009795e-05, "loss": 0.2362, "step": 1815 }, { "epoch": 2.389473684210526, "grad_norm": 0.086881482942677, "learning_rate": 2.4188563128791254e-05, "loss": 0.2352, "step": 1816 }, { "epoch": 2.3907894736842104, "grad_norm": 0.08698301322688715, "learning_rate": 2.4088812811400773e-05, "loss": 0.2366, "step": 1817 }, { "epoch": 2.3921052631578945, "grad_norm": 0.08860609435080301, "learning_rate": 2.3989240424646355e-05, "loss": 0.2579, "step": 1818 }, { "epoch": 2.393421052631579, "grad_norm": 0.08501020836673648, "learning_rate": 2.388984620191883e-05, "loss": 0.2305, "step": 1819 }, { "epoch": 2.3947368421052633, "grad_norm": 0.08510368650070274, "learning_rate": 2.379063037619146e-05, "loss": 0.247, "step": 1820 }, { "epoch": 2.3960526315789474, "grad_norm": 0.0829406506692919, "learning_rate": 2.3691593180019366e-05, "loss": 0.2278, "step": 1821 }, { "epoch": 2.3973684210526316, "grad_norm": 0.0867391319954807, "learning_rate": 2.3592734845538956e-05, "loss": 0.2336, "step": 1822 }, { "epoch": 2.3986842105263158, "grad_norm": 0.08541140823531897, "learning_rate": 2.3494055604467447e-05, "loss": 0.2492, "step": 1823 }, { "epoch": 2.4, "grad_norm": 0.08302602754112574, "learning_rate": 2.339555568810221e-05, "loss": 0.2373, "step": 1824 }, { "epoch": 2.401315789473684, "grad_norm": 0.08577260892930051, "learning_rate": 2.32972353273203e-05, "loss": 0.2419, "step": 1825 }, { "epoch": 2.4026315789473682, "grad_norm": 0.08774292891845863, "learning_rate": 2.319909475257799e-05, "loss": 0.2405, "step": 1826 }, { "epoch": 2.4039473684210524, "grad_norm": 0.08550920787272853, "learning_rate": 2.3101134193910024e-05, "loss": 0.2434, "step": 1827 }, { "epoch": 2.405263157894737, "grad_norm": 0.08426612023984284, "learning_rate": 2.300335388092929e-05, "loss": 0.2234, "step": 1828 }, { "epoch": 2.406578947368421, "grad_norm": 0.0848436411883103, "learning_rate": 2.2905754042826143e-05, "loss": 0.2377, "step": 1829 }, { "epoch": 2.4078947368421053, "grad_norm": 0.08552189669750118, "learning_rate": 2.2808334908367914e-05, "loss": 0.243, "step": 1830 }, { "epoch": 2.4092105263157895, "grad_norm": 0.08572236225629354, "learning_rate": 2.271109670589844e-05, "loss": 0.2398, "step": 1831 }, { "epoch": 2.4105263157894736, "grad_norm": 0.08511556223223665, "learning_rate": 2.2614039663337417e-05, "loss": 0.225, "step": 1832 }, { "epoch": 2.411842105263158, "grad_norm": 0.08510989348259673, "learning_rate": 2.2517164008179882e-05, "loss": 0.2393, "step": 1833 }, { "epoch": 2.413157894736842, "grad_norm": 0.08621647138074637, "learning_rate": 2.2420469967495793e-05, "loss": 0.2492, "step": 1834 }, { "epoch": 2.4144736842105265, "grad_norm": 0.08627144976057537, "learning_rate": 2.232395776792938e-05, "loss": 0.2351, "step": 1835 }, { "epoch": 2.4157894736842107, "grad_norm": 0.08449582717174542, "learning_rate": 2.222762763569862e-05, "loss": 0.2331, "step": 1836 }, { "epoch": 2.417105263157895, "grad_norm": 0.08533743090482276, "learning_rate": 2.2131479796594767e-05, "loss": 0.2356, "step": 1837 }, { "epoch": 2.418421052631579, "grad_norm": 0.08874843333228837, "learning_rate": 2.2035514475981756e-05, "loss": 0.242, "step": 1838 }, { "epoch": 2.419736842105263, "grad_norm": 0.08529644318540644, "learning_rate": 2.1939731898795802e-05, "loss": 0.2239, "step": 1839 }, { "epoch": 2.4210526315789473, "grad_norm": 0.08594783838232867, "learning_rate": 2.184413228954468e-05, "loss": 0.2411, "step": 1840 }, { "epoch": 2.4223684210526315, "grad_norm": 0.09034497841922952, "learning_rate": 2.1748715872307345e-05, "loss": 0.2437, "step": 1841 }, { "epoch": 2.4236842105263157, "grad_norm": 0.0864543032403873, "learning_rate": 2.165348287073339e-05, "loss": 0.2497, "step": 1842 }, { "epoch": 2.425, "grad_norm": 0.08423651720196439, "learning_rate": 2.155843350804243e-05, "loss": 0.233, "step": 1843 }, { "epoch": 2.4263157894736844, "grad_norm": 0.08476088806860947, "learning_rate": 2.1463568007023704e-05, "loss": 0.2338, "step": 1844 }, { "epoch": 2.4276315789473686, "grad_norm": 0.08560695927731492, "learning_rate": 2.1368886590035443e-05, "loss": 0.2474, "step": 1845 }, { "epoch": 2.4289473684210527, "grad_norm": 0.08571656417086615, "learning_rate": 2.1274389479004397e-05, "loss": 0.2343, "step": 1846 }, { "epoch": 2.430263157894737, "grad_norm": 0.08484405706280464, "learning_rate": 2.1180076895425395e-05, "loss": 0.2358, "step": 1847 }, { "epoch": 2.431578947368421, "grad_norm": 0.09045256351090854, "learning_rate": 2.1085949060360654e-05, "loss": 0.241, "step": 1848 }, { "epoch": 2.432894736842105, "grad_norm": 0.08339645141686791, "learning_rate": 2.0992006194439372e-05, "loss": 0.2417, "step": 1849 }, { "epoch": 2.4342105263157894, "grad_norm": 0.08857069700987559, "learning_rate": 2.0898248517857256e-05, "loss": 0.2281, "step": 1850 }, { "epoch": 2.4355263157894735, "grad_norm": 0.08727223363474927, "learning_rate": 2.0804676250375867e-05, "loss": 0.2408, "step": 1851 }, { "epoch": 2.4368421052631577, "grad_norm": 0.08636083640113454, "learning_rate": 2.0711289611322204e-05, "loss": 0.2357, "step": 1852 }, { "epoch": 2.4381578947368423, "grad_norm": 0.08601921483257845, "learning_rate": 2.0618088819588167e-05, "loss": 0.2362, "step": 1853 }, { "epoch": 2.4394736842105265, "grad_norm": 0.08573635555006953, "learning_rate": 2.0525074093630036e-05, "loss": 0.2382, "step": 1854 }, { "epoch": 2.4407894736842106, "grad_norm": 0.08431162925824574, "learning_rate": 2.0432245651467995e-05, "loss": 0.2336, "step": 1855 }, { "epoch": 2.442105263157895, "grad_norm": 0.08468414851359736, "learning_rate": 2.033960371068557e-05, "loss": 0.2217, "step": 1856 }, { "epoch": 2.443421052631579, "grad_norm": 0.08472832758486493, "learning_rate": 2.02471484884291e-05, "loss": 0.2378, "step": 1857 }, { "epoch": 2.444736842105263, "grad_norm": 0.08936877382897962, "learning_rate": 2.0154880201407367e-05, "loss": 0.2449, "step": 1858 }, { "epoch": 2.4460526315789473, "grad_norm": 0.0841329121430106, "learning_rate": 2.0062799065890904e-05, "loss": 0.2318, "step": 1859 }, { "epoch": 2.4473684210526314, "grad_norm": 0.08665657400105448, "learning_rate": 1.9970905297711606e-05, "loss": 0.2404, "step": 1860 }, { "epoch": 2.4486842105263156, "grad_norm": 0.08891275836559771, "learning_rate": 1.987919911226217e-05, "loss": 0.237, "step": 1861 }, { "epoch": 2.45, "grad_norm": 0.08586405744449396, "learning_rate": 1.9787680724495617e-05, "loss": 0.2316, "step": 1862 }, { "epoch": 2.4513157894736843, "grad_norm": 0.08602563906517424, "learning_rate": 1.969635034892485e-05, "loss": 0.2357, "step": 1863 }, { "epoch": 2.4526315789473685, "grad_norm": 0.08605411678673527, "learning_rate": 1.9605208199621995e-05, "loss": 0.2368, "step": 1864 }, { "epoch": 2.4539473684210527, "grad_norm": 0.0847811039119867, "learning_rate": 1.9514254490218e-05, "loss": 0.2278, "step": 1865 }, { "epoch": 2.455263157894737, "grad_norm": 0.089108249737477, "learning_rate": 1.9423489433902186e-05, "loss": 0.2421, "step": 1866 }, { "epoch": 2.456578947368421, "grad_norm": 0.08853862535589128, "learning_rate": 1.9332913243421634e-05, "loss": 0.2346, "step": 1867 }, { "epoch": 2.457894736842105, "grad_norm": 0.08624747222459084, "learning_rate": 1.924252613108073e-05, "loss": 0.2385, "step": 1868 }, { "epoch": 2.4592105263157893, "grad_norm": 0.08633100421050345, "learning_rate": 1.9152328308740707e-05, "loss": 0.2281, "step": 1869 }, { "epoch": 2.4605263157894735, "grad_norm": 0.0823894382169209, "learning_rate": 1.9062319987819067e-05, "loss": 0.2313, "step": 1870 }, { "epoch": 2.461842105263158, "grad_norm": 0.08514975111544096, "learning_rate": 1.897250137928921e-05, "loss": 0.2301, "step": 1871 }, { "epoch": 2.463157894736842, "grad_norm": 0.08573391873752553, "learning_rate": 1.888287269367979e-05, "loss": 0.2253, "step": 1872 }, { "epoch": 2.4644736842105264, "grad_norm": 0.08763954109623806, "learning_rate": 1.8793434141074295e-05, "loss": 0.2446, "step": 1873 }, { "epoch": 2.4657894736842105, "grad_norm": 0.0848193021797019, "learning_rate": 1.870418593111064e-05, "loss": 0.2421, "step": 1874 }, { "epoch": 2.4671052631578947, "grad_norm": 0.08299936916001775, "learning_rate": 1.861512827298051e-05, "loss": 0.2377, "step": 1875 }, { "epoch": 2.468421052631579, "grad_norm": 0.08656696401202549, "learning_rate": 1.8526261375428955e-05, "loss": 0.2453, "step": 1876 }, { "epoch": 2.469736842105263, "grad_norm": 0.0845516003224581, "learning_rate": 1.8437585446753925e-05, "loss": 0.2264, "step": 1877 }, { "epoch": 2.4710526315789476, "grad_norm": 0.08820431143873694, "learning_rate": 1.834910069480571e-05, "loss": 0.2553, "step": 1878 }, { "epoch": 2.4723684210526318, "grad_norm": 0.086680366075887, "learning_rate": 1.826080732698656e-05, "loss": 0.2332, "step": 1879 }, { "epoch": 2.473684210526316, "grad_norm": 0.08757999419788688, "learning_rate": 1.8172705550250092e-05, "loss": 0.2365, "step": 1880 }, { "epoch": 2.475, "grad_norm": 0.0819273273150161, "learning_rate": 1.808479557110081e-05, "loss": 0.2222, "step": 1881 }, { "epoch": 2.4763157894736842, "grad_norm": 0.08333326519630053, "learning_rate": 1.799707759559376e-05, "loss": 0.2393, "step": 1882 }, { "epoch": 2.4776315789473684, "grad_norm": 0.08611320760984689, "learning_rate": 1.790955182933385e-05, "loss": 0.2268, "step": 1883 }, { "epoch": 2.4789473684210526, "grad_norm": 0.08633530140680602, "learning_rate": 1.7822218477475494e-05, "loss": 0.236, "step": 1884 }, { "epoch": 2.4802631578947367, "grad_norm": 0.08652135837244138, "learning_rate": 1.7735077744722107e-05, "loss": 0.2351, "step": 1885 }, { "epoch": 2.481578947368421, "grad_norm": 0.08815435855670466, "learning_rate": 1.7648129835325587e-05, "loss": 0.2347, "step": 1886 }, { "epoch": 2.4828947368421055, "grad_norm": 0.0879883933592944, "learning_rate": 1.756137495308594e-05, "loss": 0.2328, "step": 1887 }, { "epoch": 2.4842105263157896, "grad_norm": 0.08643787173876677, "learning_rate": 1.7474813301350666e-05, "loss": 0.2387, "step": 1888 }, { "epoch": 2.485526315789474, "grad_norm": 0.08893052591767589, "learning_rate": 1.7388445083014325e-05, "loss": 0.2428, "step": 1889 }, { "epoch": 2.486842105263158, "grad_norm": 0.08536802381106222, "learning_rate": 1.7302270500518182e-05, "loss": 0.2375, "step": 1890 }, { "epoch": 2.488157894736842, "grad_norm": 0.08703152842078288, "learning_rate": 1.7216289755849525e-05, "loss": 0.2308, "step": 1891 }, { "epoch": 2.4894736842105263, "grad_norm": 0.0836327066485613, "learning_rate": 1.7130503050541368e-05, "loss": 0.2363, "step": 1892 }, { "epoch": 2.4907894736842104, "grad_norm": 0.08521480449342837, "learning_rate": 1.704491058567187e-05, "loss": 0.2371, "step": 1893 }, { "epoch": 2.4921052631578946, "grad_norm": 0.08544333339590796, "learning_rate": 1.6959512561863912e-05, "loss": 0.2376, "step": 1894 }, { "epoch": 2.4934210526315788, "grad_norm": 0.08419989528831981, "learning_rate": 1.6874309179284664e-05, "loss": 0.233, "step": 1895 }, { "epoch": 2.4947368421052634, "grad_norm": 0.08743178517536454, "learning_rate": 1.6789300637645e-05, "loss": 0.24, "step": 1896 }, { "epoch": 2.4960526315789475, "grad_norm": 0.08627446891742664, "learning_rate": 1.670448713619913e-05, "loss": 0.2418, "step": 1897 }, { "epoch": 2.4973684210526317, "grad_norm": 0.086520690921376, "learning_rate": 1.6619868873744147e-05, "loss": 0.2333, "step": 1898 }, { "epoch": 2.498684210526316, "grad_norm": 0.0890053931742651, "learning_rate": 1.653544604861945e-05, "loss": 0.245, "step": 1899 }, { "epoch": 2.5, "grad_norm": 0.08795310211457955, "learning_rate": 1.6451218858706374e-05, "loss": 0.2424, "step": 1900 }, { "epoch": 2.501315789473684, "grad_norm": 0.08768860420319245, "learning_rate": 1.6367187501427685e-05, "loss": 0.2409, "step": 1901 }, { "epoch": 2.5026315789473683, "grad_norm": 0.08433281148019468, "learning_rate": 1.6283352173747145e-05, "loss": 0.2332, "step": 1902 }, { "epoch": 2.5039473684210525, "grad_norm": 0.08869485067176774, "learning_rate": 1.6199713072169053e-05, "loss": 0.247, "step": 1903 }, { "epoch": 2.5052631578947366, "grad_norm": 0.08535593567657823, "learning_rate": 1.6116270392737754e-05, "loss": 0.2331, "step": 1904 }, { "epoch": 2.5065789473684212, "grad_norm": 0.08375689041638204, "learning_rate": 1.6033024331037138e-05, "loss": 0.2275, "step": 1905 }, { "epoch": 2.5078947368421054, "grad_norm": 0.08478044933594917, "learning_rate": 1.5949975082190337e-05, "loss": 0.2339, "step": 1906 }, { "epoch": 2.5092105263157896, "grad_norm": 0.08731008917029157, "learning_rate": 1.5867122840859117e-05, "loss": 0.2345, "step": 1907 }, { "epoch": 2.5105263157894737, "grad_norm": 0.0867834465835224, "learning_rate": 1.578446780124344e-05, "loss": 0.2306, "step": 1908 }, { "epoch": 2.511842105263158, "grad_norm": 0.08484337640412591, "learning_rate": 1.570201015708108e-05, "loss": 0.2421, "step": 1909 }, { "epoch": 2.513157894736842, "grad_norm": 0.08676705310204419, "learning_rate": 1.5619750101647114e-05, "loss": 0.2431, "step": 1910 }, { "epoch": 2.514473684210526, "grad_norm": 0.08560817073143388, "learning_rate": 1.553768782775351e-05, "loss": 0.2333, "step": 1911 }, { "epoch": 2.515789473684211, "grad_norm": 0.08577547460347472, "learning_rate": 1.5455823527748626e-05, "loss": 0.2372, "step": 1912 }, { "epoch": 2.5171052631578945, "grad_norm": 0.08601962847067497, "learning_rate": 1.5374157393516764e-05, "loss": 0.2334, "step": 1913 }, { "epoch": 2.518421052631579, "grad_norm": 0.08618206990680746, "learning_rate": 1.5292689616477806e-05, "loss": 0.2355, "step": 1914 }, { "epoch": 2.5197368421052633, "grad_norm": 0.08561559936263621, "learning_rate": 1.5211420387586638e-05, "loss": 0.2397, "step": 1915 }, { "epoch": 2.5210526315789474, "grad_norm": 0.08569149401048583, "learning_rate": 1.5130349897332763e-05, "loss": 0.2385, "step": 1916 }, { "epoch": 2.5223684210526316, "grad_norm": 0.08528987545012465, "learning_rate": 1.5049478335739886e-05, "loss": 0.2422, "step": 1917 }, { "epoch": 2.5236842105263158, "grad_norm": 0.08573039796052453, "learning_rate": 1.49688058923654e-05, "loss": 0.2346, "step": 1918 }, { "epoch": 2.525, "grad_norm": 0.08483858064921189, "learning_rate": 1.4888332756300027e-05, "loss": 0.2335, "step": 1919 }, { "epoch": 2.526315789473684, "grad_norm": 0.08618837736943744, "learning_rate": 1.4808059116167305e-05, "loss": 0.2321, "step": 1920 }, { "epoch": 2.5276315789473687, "grad_norm": 0.08306460927365213, "learning_rate": 1.4727985160123114e-05, "loss": 0.2322, "step": 1921 }, { "epoch": 2.5289473684210524, "grad_norm": 0.08847498270917728, "learning_rate": 1.4648111075855398e-05, "loss": 0.238, "step": 1922 }, { "epoch": 2.530263157894737, "grad_norm": 0.08664647691974588, "learning_rate": 1.4568437050583517e-05, "loss": 0.2329, "step": 1923 }, { "epoch": 2.531578947368421, "grad_norm": 0.08925646785401711, "learning_rate": 1.4488963271057943e-05, "loss": 0.2447, "step": 1924 }, { "epoch": 2.5328947368421053, "grad_norm": 0.0858422178404279, "learning_rate": 1.44096899235598e-05, "loss": 0.2339, "step": 1925 }, { "epoch": 2.5342105263157895, "grad_norm": 0.085698190947571, "learning_rate": 1.4330617193900364e-05, "loss": 0.2329, "step": 1926 }, { "epoch": 2.5355263157894736, "grad_norm": 0.08849510463317915, "learning_rate": 1.4251745267420757e-05, "loss": 0.236, "step": 1927 }, { "epoch": 2.536842105263158, "grad_norm": 0.08642342384169846, "learning_rate": 1.4173074328991377e-05, "loss": 0.2292, "step": 1928 }, { "epoch": 2.538157894736842, "grad_norm": 0.08593496440244164, "learning_rate": 1.4094604563011472e-05, "loss": 0.2308, "step": 1929 }, { "epoch": 2.5394736842105265, "grad_norm": 0.08537576961097464, "learning_rate": 1.4016336153408893e-05, "loss": 0.2318, "step": 1930 }, { "epoch": 2.5407894736842103, "grad_norm": 0.08435630547007106, "learning_rate": 1.3938269283639394e-05, "loss": 0.2244, "step": 1931 }, { "epoch": 2.542105263157895, "grad_norm": 0.0894275649268176, "learning_rate": 1.3860404136686411e-05, "loss": 0.2386, "step": 1932 }, { "epoch": 2.543421052631579, "grad_norm": 0.08731027043438779, "learning_rate": 1.3782740895060497e-05, "loss": 0.2438, "step": 1933 }, { "epoch": 2.544736842105263, "grad_norm": 0.08677749818948599, "learning_rate": 1.3705279740798993e-05, "loss": 0.2383, "step": 1934 }, { "epoch": 2.5460526315789473, "grad_norm": 0.08591491161614231, "learning_rate": 1.3628020855465572e-05, "loss": 0.2387, "step": 1935 }, { "epoch": 2.5473684210526315, "grad_norm": 0.08575789115259161, "learning_rate": 1.355096442014977e-05, "loss": 0.2325, "step": 1936 }, { "epoch": 2.5486842105263157, "grad_norm": 0.08651010282289677, "learning_rate": 1.3474110615466583e-05, "loss": 0.2279, "step": 1937 }, { "epoch": 2.55, "grad_norm": 0.08869408109954365, "learning_rate": 1.339745962155613e-05, "loss": 0.2351, "step": 1938 }, { "epoch": 2.5513157894736844, "grad_norm": 0.08792931672329181, "learning_rate": 1.3321011618083079e-05, "loss": 0.2439, "step": 1939 }, { "epoch": 2.5526315789473686, "grad_norm": 0.08816455279822999, "learning_rate": 1.3244766784236307e-05, "loss": 0.2429, "step": 1940 }, { "epoch": 2.5539473684210527, "grad_norm": 0.08652728797542152, "learning_rate": 1.3168725298728524e-05, "loss": 0.2322, "step": 1941 }, { "epoch": 2.555263157894737, "grad_norm": 0.08646822510404321, "learning_rate": 1.3092887339795734e-05, "loss": 0.2448, "step": 1942 }, { "epoch": 2.556578947368421, "grad_norm": 0.08827748125160723, "learning_rate": 1.3017253085196979e-05, "loss": 0.2432, "step": 1943 }, { "epoch": 2.557894736842105, "grad_norm": 0.087400695177381, "learning_rate": 1.294182271221377e-05, "loss": 0.242, "step": 1944 }, { "epoch": 2.5592105263157894, "grad_norm": 0.08476067656041161, "learning_rate": 1.2866596397649721e-05, "loss": 0.2332, "step": 1945 }, { "epoch": 2.5605263157894735, "grad_norm": 0.08400036515218953, "learning_rate": 1.2791574317830213e-05, "loss": 0.2335, "step": 1946 }, { "epoch": 2.5618421052631577, "grad_norm": 0.08714320110806623, "learning_rate": 1.2716756648601857e-05, "loss": 0.2384, "step": 1947 }, { "epoch": 2.5631578947368423, "grad_norm": 0.08718032193532914, "learning_rate": 1.2642143565332154e-05, "loss": 0.2363, "step": 1948 }, { "epoch": 2.5644736842105265, "grad_norm": 0.08459275476172294, "learning_rate": 1.2567735242909074e-05, "loss": 0.2226, "step": 1949 }, { "epoch": 2.5657894736842106, "grad_norm": 0.08586849969458207, "learning_rate": 1.2493531855740625e-05, "loss": 0.2247, "step": 1950 }, { "epoch": 2.567105263157895, "grad_norm": 0.08846213083149775, "learning_rate": 1.2419533577754528e-05, "loss": 0.2333, "step": 1951 }, { "epoch": 2.568421052631579, "grad_norm": 0.08643152306184454, "learning_rate": 1.2345740582397648e-05, "loss": 0.2371, "step": 1952 }, { "epoch": 2.569736842105263, "grad_norm": 0.08661338412832961, "learning_rate": 1.2272153042635704e-05, "loss": 0.2434, "step": 1953 }, { "epoch": 2.5710526315789473, "grad_norm": 0.08897240710261847, "learning_rate": 1.2198771130952913e-05, "loss": 0.2376, "step": 1954 }, { "epoch": 2.5723684210526314, "grad_norm": 0.08622184838413748, "learning_rate": 1.2125595019351443e-05, "loss": 0.2307, "step": 1955 }, { "epoch": 2.5736842105263156, "grad_norm": 0.0907063460852498, "learning_rate": 1.2052624879351104e-05, "loss": 0.2313, "step": 1956 }, { "epoch": 2.575, "grad_norm": 0.08625315493284626, "learning_rate": 1.1979860881988902e-05, "loss": 0.2383, "step": 1957 }, { "epoch": 2.5763157894736843, "grad_norm": 0.08964244854263959, "learning_rate": 1.1907303197818665e-05, "loss": 0.2394, "step": 1958 }, { "epoch": 2.5776315789473685, "grad_norm": 0.08330532747614758, "learning_rate": 1.183495199691068e-05, "loss": 0.2215, "step": 1959 }, { "epoch": 2.5789473684210527, "grad_norm": 0.0884851429344919, "learning_rate": 1.176280744885121e-05, "loss": 0.2423, "step": 1960 }, { "epoch": 2.580263157894737, "grad_norm": 0.08980407385361433, "learning_rate": 1.1690869722742126e-05, "loss": 0.2393, "step": 1961 }, { "epoch": 2.581578947368421, "grad_norm": 0.08634666525024143, "learning_rate": 1.1619138987200562e-05, "loss": 0.2287, "step": 1962 }, { "epoch": 2.582894736842105, "grad_norm": 0.08589844097584014, "learning_rate": 1.154761541035847e-05, "loss": 0.2353, "step": 1963 }, { "epoch": 2.5842105263157897, "grad_norm": 0.08695619199252902, "learning_rate": 1.1476299159862203e-05, "loss": 0.2424, "step": 1964 }, { "epoch": 2.5855263157894735, "grad_norm": 0.08435657255170038, "learning_rate": 1.1405190402872202e-05, "loss": 0.2308, "step": 1965 }, { "epoch": 2.586842105263158, "grad_norm": 0.08681956736459753, "learning_rate": 1.1334289306062495e-05, "loss": 0.2387, "step": 1966 }, { "epoch": 2.588157894736842, "grad_norm": 0.08650224803441879, "learning_rate": 1.126359603562045e-05, "loss": 0.2369, "step": 1967 }, { "epoch": 2.5894736842105264, "grad_norm": 0.08441575091576375, "learning_rate": 1.119311075724625e-05, "loss": 0.2195, "step": 1968 }, { "epoch": 2.5907894736842105, "grad_norm": 0.08371697252642783, "learning_rate": 1.1122833636152563e-05, "loss": 0.2174, "step": 1969 }, { "epoch": 2.5921052631578947, "grad_norm": 0.0868737302053953, "learning_rate": 1.1052764837064178e-05, "loss": 0.2283, "step": 1970 }, { "epoch": 2.593421052631579, "grad_norm": 0.08687526871015291, "learning_rate": 1.0982904524217551e-05, "loss": 0.2291, "step": 1971 }, { "epoch": 2.594736842105263, "grad_norm": 0.08813319422928485, "learning_rate": 1.09132528613605e-05, "loss": 0.2449, "step": 1972 }, { "epoch": 2.5960526315789476, "grad_norm": 0.08856476761244737, "learning_rate": 1.0843810011751766e-05, "loss": 0.2422, "step": 1973 }, { "epoch": 2.5973684210526313, "grad_norm": 0.08769322323788943, "learning_rate": 1.0774576138160597e-05, "loss": 0.245, "step": 1974 }, { "epoch": 2.598684210526316, "grad_norm": 0.0854386832496123, "learning_rate": 1.070555140286652e-05, "loss": 0.2269, "step": 1975 }, { "epoch": 2.6, "grad_norm": 0.08380866243911338, "learning_rate": 1.0636735967658784e-05, "loss": 0.239, "step": 1976 }, { "epoch": 2.6013157894736842, "grad_norm": 0.08479138359498807, "learning_rate": 1.056812999383604e-05, "loss": 0.2321, "step": 1977 }, { "epoch": 2.6026315789473684, "grad_norm": 0.08709294911680168, "learning_rate": 1.0499733642206033e-05, "loss": 0.239, "step": 1978 }, { "epoch": 2.6039473684210526, "grad_norm": 0.08505697002044242, "learning_rate": 1.0431547073085135e-05, "loss": 0.2314, "step": 1979 }, { "epoch": 2.6052631578947367, "grad_norm": 0.08770454123258202, "learning_rate": 1.0363570446297999e-05, "loss": 0.2455, "step": 1980 }, { "epoch": 2.606578947368421, "grad_norm": 0.08477772700319303, "learning_rate": 1.0295803921177182e-05, "loss": 0.2277, "step": 1981 }, { "epoch": 2.6078947368421055, "grad_norm": 0.08575690009081015, "learning_rate": 1.0228247656562795e-05, "loss": 0.2352, "step": 1982 }, { "epoch": 2.609210526315789, "grad_norm": 0.0852766947310931, "learning_rate": 1.0160901810802115e-05, "loss": 0.2379, "step": 1983 }, { "epoch": 2.610526315789474, "grad_norm": 0.08360204366625848, "learning_rate": 1.0093766541749205e-05, "loss": 0.2266, "step": 1984 }, { "epoch": 2.611842105263158, "grad_norm": 0.08424512848837429, "learning_rate": 1.0026842006764526e-05, "loss": 0.2307, "step": 1985 }, { "epoch": 2.613157894736842, "grad_norm": 0.08406138879126103, "learning_rate": 9.960128362714637e-06, "loss": 0.2293, "step": 1986 }, { "epoch": 2.6144736842105263, "grad_norm": 0.08644831073662886, "learning_rate": 9.89362576597177e-06, "loss": 0.2334, "step": 1987 }, { "epoch": 2.6157894736842104, "grad_norm": 0.08623901530925424, "learning_rate": 9.827334372413444e-06, "loss": 0.2358, "step": 1988 }, { "epoch": 2.6171052631578946, "grad_norm": 0.08269859922491032, "learning_rate": 9.761254337422176e-06, "loss": 0.2167, "step": 1989 }, { "epoch": 2.6184210526315788, "grad_norm": 0.08644342262890786, "learning_rate": 9.695385815885016e-06, "loss": 0.2301, "step": 1990 }, { "epoch": 2.6197368421052634, "grad_norm": 0.08629959299387181, "learning_rate": 9.629728962193318e-06, "loss": 0.2289, "step": 1991 }, { "epoch": 2.6210526315789475, "grad_norm": 0.08512850244099619, "learning_rate": 9.564283930242257e-06, "loss": 0.2251, "step": 1992 }, { "epoch": 2.6223684210526317, "grad_norm": 0.0855143103873754, "learning_rate": 9.499050873430482e-06, "loss": 0.2253, "step": 1993 }, { "epoch": 2.623684210526316, "grad_norm": 0.08812972174025184, "learning_rate": 9.434029944659872e-06, "loss": 0.2227, "step": 1994 }, { "epoch": 2.625, "grad_norm": 0.08923681105940687, "learning_rate": 9.369221296335006e-06, "loss": 0.2519, "step": 1995 }, { "epoch": 2.626315789473684, "grad_norm": 0.08888942368903012, "learning_rate": 9.30462508036294e-06, "loss": 0.2451, "step": 1996 }, { "epoch": 2.6276315789473683, "grad_norm": 0.08691289008570331, "learning_rate": 9.240241448152787e-06, "loss": 0.235, "step": 1997 }, { "epoch": 2.6289473684210525, "grad_norm": 0.08587149284301165, "learning_rate": 9.176070550615378e-06, "loss": 0.2273, "step": 1998 }, { "epoch": 2.6302631578947366, "grad_norm": 0.08822548058662702, "learning_rate": 9.112112538162898e-06, "loss": 0.2356, "step": 1999 }, { "epoch": 2.6315789473684212, "grad_norm": 0.08348460888767743, "learning_rate": 9.048367560708604e-06, "loss": 0.2297, "step": 2000 }, { "epoch": 2.6328947368421054, "grad_norm": 0.08551510433915722, "learning_rate": 8.98483576766631e-06, "loss": 0.2324, "step": 2001 }, { "epoch": 2.6342105263157896, "grad_norm": 0.08388165924617051, "learning_rate": 8.921517307950255e-06, "loss": 0.229, "step": 2002 }, { "epoch": 2.6355263157894737, "grad_norm": 0.08896318131104192, "learning_rate": 8.858412329974552e-06, "loss": 0.2385, "step": 2003 }, { "epoch": 2.636842105263158, "grad_norm": 0.09015074541095214, "learning_rate": 8.795520981652961e-06, "loss": 0.2392, "step": 2004 }, { "epoch": 2.638157894736842, "grad_norm": 0.08696268311755063, "learning_rate": 8.732843410398506e-06, "loss": 0.2294, "step": 2005 }, { "epoch": 2.639473684210526, "grad_norm": 0.08769627213456496, "learning_rate": 8.670379763123126e-06, "loss": 0.2384, "step": 2006 }, { "epoch": 2.640789473684211, "grad_norm": 0.08643115429762364, "learning_rate": 8.608130186237329e-06, "loss": 0.25, "step": 2007 }, { "epoch": 2.6421052631578945, "grad_norm": 0.08746848122982008, "learning_rate": 8.546094825649908e-06, "loss": 0.2398, "step": 2008 }, { "epoch": 2.643421052631579, "grad_norm": 0.08700832769044184, "learning_rate": 8.484273826767474e-06, "loss": 0.2315, "step": 2009 }, { "epoch": 2.6447368421052633, "grad_norm": 0.08453786644008071, "learning_rate": 8.422667334494249e-06, "loss": 0.2285, "step": 2010 }, { "epoch": 2.6460526315789474, "grad_norm": 0.08586615693833513, "learning_rate": 8.361275493231646e-06, "loss": 0.2364, "step": 2011 }, { "epoch": 2.6473684210526316, "grad_norm": 0.0870712891159542, "learning_rate": 8.300098446877923e-06, "loss": 0.2265, "step": 2012 }, { "epoch": 2.6486842105263158, "grad_norm": 0.08440192942964624, "learning_rate": 8.239136338827903e-06, "loss": 0.2291, "step": 2013 }, { "epoch": 2.65, "grad_norm": 0.08705307636244689, "learning_rate": 8.178389311972612e-06, "loss": 0.2392, "step": 2014 }, { "epoch": 2.651315789473684, "grad_norm": 0.08894269433728565, "learning_rate": 8.1178575086989e-06, "loss": 0.2315, "step": 2015 }, { "epoch": 2.6526315789473687, "grad_norm": 0.08919405616773922, "learning_rate": 8.05754107088923e-06, "loss": 0.2429, "step": 2016 }, { "epoch": 2.6539473684210524, "grad_norm": 0.08652861044002905, "learning_rate": 7.997440139921152e-06, "loss": 0.2386, "step": 2017 }, { "epoch": 2.655263157894737, "grad_norm": 0.08615918194801953, "learning_rate": 7.937554856667196e-06, "loss": 0.2275, "step": 2018 }, { "epoch": 2.656578947368421, "grad_norm": 0.0867406788792665, "learning_rate": 7.877885361494353e-06, "loss": 0.2294, "step": 2019 }, { "epoch": 2.6578947368421053, "grad_norm": 0.08726072526321839, "learning_rate": 7.818431794263836e-06, "loss": 0.236, "step": 2020 }, { "epoch": 2.6592105263157895, "grad_norm": 0.08547170774404624, "learning_rate": 7.759194294330751e-06, "loss": 0.2416, "step": 2021 }, { "epoch": 2.6605263157894736, "grad_norm": 0.08743733901669834, "learning_rate": 7.700173000543742e-06, "loss": 0.2409, "step": 2022 }, { "epoch": 2.661842105263158, "grad_norm": 0.08978412000343398, "learning_rate": 7.641368051244679e-06, "loss": 0.2303, "step": 2023 }, { "epoch": 2.663157894736842, "grad_norm": 0.08595515849525466, "learning_rate": 7.582779584268373e-06, "loss": 0.2392, "step": 2024 }, { "epoch": 2.6644736842105265, "grad_norm": 0.08931169378561088, "learning_rate": 7.524407736942174e-06, "loss": 0.236, "step": 2025 }, { "epoch": 2.6657894736842103, "grad_norm": 0.08550626450400388, "learning_rate": 7.466252646085703e-06, "loss": 0.2351, "step": 2026 }, { "epoch": 2.667105263157895, "grad_norm": 0.08787743584655944, "learning_rate": 7.4083144480105335e-06, "loss": 0.2464, "step": 2027 }, { "epoch": 2.668421052631579, "grad_norm": 0.0875076538218403, "learning_rate": 7.350593278519824e-06, "loss": 0.238, "step": 2028 }, { "epoch": 2.669736842105263, "grad_norm": 0.09154668653402206, "learning_rate": 7.2930892729080716e-06, "loss": 0.2381, "step": 2029 }, { "epoch": 2.6710526315789473, "grad_norm": 0.08720851842242237, "learning_rate": 7.235802565960714e-06, "loss": 0.2394, "step": 2030 }, { "epoch": 2.6723684210526315, "grad_norm": 0.08749018793861221, "learning_rate": 7.178733291953865e-06, "loss": 0.2331, "step": 2031 }, { "epoch": 2.6736842105263157, "grad_norm": 0.08792096320231124, "learning_rate": 7.121881584654056e-06, "loss": 0.2415, "step": 2032 }, { "epoch": 2.675, "grad_norm": 0.08722269899883195, "learning_rate": 7.0652475773177464e-06, "loss": 0.236, "step": 2033 }, { "epoch": 2.6763157894736844, "grad_norm": 0.08769168784492533, "learning_rate": 7.00883140269123e-06, "loss": 0.2409, "step": 2034 }, { "epoch": 2.6776315789473686, "grad_norm": 0.09065018938662237, "learning_rate": 6.95263319301015e-06, "loss": 0.244, "step": 2035 }, { "epoch": 2.6789473684210527, "grad_norm": 0.08628478345157022, "learning_rate": 6.896653079999249e-06, "loss": 0.2418, "step": 2036 }, { "epoch": 2.680263157894737, "grad_norm": 0.0847424030869481, "learning_rate": 6.840891194872112e-06, "loss": 0.2367, "step": 2037 }, { "epoch": 2.681578947368421, "grad_norm": 0.09082411672610778, "learning_rate": 6.785347668330777e-06, "loss": 0.2496, "step": 2038 }, { "epoch": 2.682894736842105, "grad_norm": 0.08829134391849804, "learning_rate": 6.730022630565458e-06, "loss": 0.242, "step": 2039 }, { "epoch": 2.6842105263157894, "grad_norm": 0.08183719511764015, "learning_rate": 6.674916211254289e-06, "loss": 0.2239, "step": 2040 }, { "epoch": 2.6855263157894735, "grad_norm": 0.08448377337525598, "learning_rate": 6.620028539562939e-06, "loss": 0.2306, "step": 2041 }, { "epoch": 2.6868421052631577, "grad_norm": 0.0850861974045535, "learning_rate": 6.565359744144373e-06, "loss": 0.2434, "step": 2042 }, { "epoch": 2.6881578947368423, "grad_norm": 0.08652193833655933, "learning_rate": 6.510909953138511e-06, "loss": 0.2399, "step": 2043 }, { "epoch": 2.6894736842105265, "grad_norm": 0.08571622313764427, "learning_rate": 6.45667929417193e-06, "loss": 0.2168, "step": 2044 }, { "epoch": 2.6907894736842106, "grad_norm": 0.0891461015547161, "learning_rate": 6.402667894357595e-06, "loss": 0.2358, "step": 2045 }, { "epoch": 2.692105263157895, "grad_norm": 0.08690415491550392, "learning_rate": 6.3488758802945354e-06, "loss": 0.2306, "step": 2046 }, { "epoch": 2.693421052631579, "grad_norm": 0.08712160901508374, "learning_rate": 6.2953033780675406e-06, "loss": 0.2435, "step": 2047 }, { "epoch": 2.694736842105263, "grad_norm": 0.08600309013612613, "learning_rate": 6.2419505132469305e-06, "loss": 0.2348, "step": 2048 }, { "epoch": 2.6960526315789473, "grad_norm": 0.08761511265084414, "learning_rate": 6.188817410888148e-06, "loss": 0.2362, "step": 2049 }, { "epoch": 2.6973684210526314, "grad_norm": 0.08786468959375783, "learning_rate": 6.1359041955315725e-06, "loss": 0.228, "step": 2050 }, { "epoch": 2.6986842105263156, "grad_norm": 0.08424404719132456, "learning_rate": 6.083210991202148e-06, "loss": 0.2309, "step": 2051 }, { "epoch": 2.7, "grad_norm": 0.08855984411695608, "learning_rate": 6.030737921409169e-06, "loss": 0.2481, "step": 2052 }, { "epoch": 2.7013157894736843, "grad_norm": 0.08622212781661236, "learning_rate": 5.978485109145904e-06, "loss": 0.2316, "step": 2053 }, { "epoch": 2.7026315789473685, "grad_norm": 0.0877254168652046, "learning_rate": 5.926452676889383e-06, "loss": 0.2442, "step": 2054 }, { "epoch": 2.7039473684210527, "grad_norm": 0.08959699248737611, "learning_rate": 5.8746407466000464e-06, "loss": 0.2334, "step": 2055 }, { "epoch": 2.705263157894737, "grad_norm": 0.08448005747326008, "learning_rate": 5.823049439721561e-06, "loss": 0.2246, "step": 2056 }, { "epoch": 2.706578947368421, "grad_norm": 0.087622602007786, "learning_rate": 5.771678877180408e-06, "loss": 0.2301, "step": 2057 }, { "epoch": 2.707894736842105, "grad_norm": 0.08772663242966719, "learning_rate": 5.720529179385659e-06, "loss": 0.2335, "step": 2058 }, { "epoch": 2.7092105263157897, "grad_norm": 0.08569105342372876, "learning_rate": 5.669600466228742e-06, "loss": 0.2334, "step": 2059 }, { "epoch": 2.7105263157894735, "grad_norm": 0.08468267671093112, "learning_rate": 5.618892857083069e-06, "loss": 0.2261, "step": 2060 }, { "epoch": 2.711842105263158, "grad_norm": 0.08830354382676614, "learning_rate": 5.568406470803799e-06, "loss": 0.2395, "step": 2061 }, { "epoch": 2.713157894736842, "grad_norm": 0.08578516507839022, "learning_rate": 5.518141425727586e-06, "loss": 0.228, "step": 2062 }, { "epoch": 2.7144736842105264, "grad_norm": 0.08452732187319882, "learning_rate": 5.468097839672237e-06, "loss": 0.2139, "step": 2063 }, { "epoch": 2.7157894736842105, "grad_norm": 0.08925179167431409, "learning_rate": 5.418275829936537e-06, "loss": 0.2473, "step": 2064 }, { "epoch": 2.7171052631578947, "grad_norm": 0.08802009007215943, "learning_rate": 5.3686755132998475e-06, "loss": 0.2354, "step": 2065 }, { "epoch": 2.718421052631579, "grad_norm": 0.08638926510806848, "learning_rate": 5.319297006021917e-06, "loss": 0.2276, "step": 2066 }, { "epoch": 2.719736842105263, "grad_norm": 0.08649040265863565, "learning_rate": 5.270140423842607e-06, "loss": 0.2319, "step": 2067 }, { "epoch": 2.7210526315789476, "grad_norm": 0.09027369123729026, "learning_rate": 5.221205881981595e-06, "loss": 0.2362, "step": 2068 }, { "epoch": 2.7223684210526313, "grad_norm": 0.09174442002171228, "learning_rate": 5.1724934951380755e-06, "loss": 0.241, "step": 2069 }, { "epoch": 2.723684210526316, "grad_norm": 0.08685169046569569, "learning_rate": 5.124003377490582e-06, "loss": 0.2389, "step": 2070 }, { "epoch": 2.725, "grad_norm": 0.08724300958525942, "learning_rate": 5.075735642696611e-06, "loss": 0.2284, "step": 2071 }, { "epoch": 2.7263157894736842, "grad_norm": 0.08498489763962802, "learning_rate": 5.02769040389246e-06, "loss": 0.2242, "step": 2072 }, { "epoch": 2.7276315789473684, "grad_norm": 0.08637681029702944, "learning_rate": 4.979867773692881e-06, "loss": 0.2304, "step": 2073 }, { "epoch": 2.7289473684210526, "grad_norm": 0.08534225285490787, "learning_rate": 4.932267864190832e-06, "loss": 0.2241, "step": 2074 }, { "epoch": 2.7302631578947367, "grad_norm": 0.08306302707156428, "learning_rate": 4.884890786957264e-06, "loss": 0.2215, "step": 2075 }, { "epoch": 2.731578947368421, "grad_norm": 0.08910155948565507, "learning_rate": 4.8377366530408254e-06, "loss": 0.2356, "step": 2076 }, { "epoch": 2.7328947368421055, "grad_norm": 0.0864142099489445, "learning_rate": 4.790805572967549e-06, "loss": 0.2376, "step": 2077 }, { "epoch": 2.734210526315789, "grad_norm": 0.08777769993987874, "learning_rate": 4.744097656740709e-06, "loss": 0.2296, "step": 2078 }, { "epoch": 2.735526315789474, "grad_norm": 0.08927992176441508, "learning_rate": 4.697613013840441e-06, "loss": 0.245, "step": 2079 }, { "epoch": 2.736842105263158, "grad_norm": 0.08952627138575911, "learning_rate": 4.65135175322361e-06, "loss": 0.2361, "step": 2080 }, { "epoch": 2.738157894736842, "grad_norm": 0.0887680485842762, "learning_rate": 4.605313983323423e-06, "loss": 0.2384, "step": 2081 }, { "epoch": 2.7394736842105263, "grad_norm": 0.08612832481286653, "learning_rate": 4.559499812049251e-06, "loss": 0.2299, "step": 2082 }, { "epoch": 2.7407894736842104, "grad_norm": 0.08792908730829178, "learning_rate": 4.513909346786427e-06, "loss": 0.24, "step": 2083 }, { "epoch": 2.7421052631578946, "grad_norm": 0.08726287355056869, "learning_rate": 4.468542694395861e-06, "loss": 0.2355, "step": 2084 }, { "epoch": 2.7434210526315788, "grad_norm": 0.08377835577746928, "learning_rate": 4.423399961213892e-06, "loss": 0.2242, "step": 2085 }, { "epoch": 2.7447368421052634, "grad_norm": 0.08736709748947386, "learning_rate": 4.378481253051991e-06, "loss": 0.2313, "step": 2086 }, { "epoch": 2.7460526315789475, "grad_norm": 0.08840579315321198, "learning_rate": 4.333786675196539e-06, "loss": 0.2494, "step": 2087 }, { "epoch": 2.7473684210526317, "grad_norm": 0.08786933648344666, "learning_rate": 4.2893163324085885e-06, "loss": 0.2391, "step": 2088 }, { "epoch": 2.748684210526316, "grad_norm": 0.0864833290624945, "learning_rate": 4.245070328923584e-06, "loss": 0.2368, "step": 2089 }, { "epoch": 2.75, "grad_norm": 0.08633401270649953, "learning_rate": 4.20104876845111e-06, "loss": 0.2371, "step": 2090 }, { "epoch": 2.751315789473684, "grad_norm": 0.0872957150222657, "learning_rate": 4.1572517541747294e-06, "loss": 0.2411, "step": 2091 }, { "epoch": 2.7526315789473683, "grad_norm": 0.08503214350266516, "learning_rate": 4.1136793887516345e-06, "loss": 0.2208, "step": 2092 }, { "epoch": 2.7539473684210525, "grad_norm": 0.08709188346280049, "learning_rate": 4.070331774312486e-06, "loss": 0.2518, "step": 2093 }, { "epoch": 2.7552631578947366, "grad_norm": 0.08684645991108973, "learning_rate": 4.027209012461108e-06, "loss": 0.2442, "step": 2094 }, { "epoch": 2.7565789473684212, "grad_norm": 0.0857591344523911, "learning_rate": 3.9843112042743045e-06, "loss": 0.2321, "step": 2095 }, { "epoch": 2.7578947368421054, "grad_norm": 0.08806771319356137, "learning_rate": 3.941638450301644e-06, "loss": 0.2497, "step": 2096 }, { "epoch": 2.7592105263157896, "grad_norm": 0.08562267017890308, "learning_rate": 3.899190850565115e-06, "loss": 0.2305, "step": 2097 }, { "epoch": 2.7605263157894737, "grad_norm": 0.0875801485743485, "learning_rate": 3.856968504558989e-06, "loss": 0.2461, "step": 2098 }, { "epoch": 2.761842105263158, "grad_norm": 0.086786356254571, "learning_rate": 3.814971511249576e-06, "loss": 0.2399, "step": 2099 }, { "epoch": 2.763157894736842, "grad_norm": 0.08869720806043141, "learning_rate": 3.7731999690749585e-06, "loss": 0.2418, "step": 2100 }, { "epoch": 2.764473684210526, "grad_norm": 0.08740038989320052, "learning_rate": 3.731653975944782e-06, "loss": 0.2313, "step": 2101 }, { "epoch": 2.765789473684211, "grad_norm": 0.08421195141482454, "learning_rate": 3.690333629239995e-06, "loss": 0.2452, "step": 2102 }, { "epoch": 2.7671052631578945, "grad_norm": 0.08786793240306752, "learning_rate": 3.6492390258126673e-06, "loss": 0.2447, "step": 2103 }, { "epoch": 2.768421052631579, "grad_norm": 0.08677843897373576, "learning_rate": 3.6083702619857605e-06, "loss": 0.2302, "step": 2104 }, { "epoch": 2.7697368421052633, "grad_norm": 0.08762759404913377, "learning_rate": 3.567727433552859e-06, "loss": 0.2398, "step": 2105 }, { "epoch": 2.7710526315789474, "grad_norm": 0.08696670352877477, "learning_rate": 3.5273106357779585e-06, "loss": 0.2303, "step": 2106 }, { "epoch": 2.7723684210526316, "grad_norm": 0.08376635095287631, "learning_rate": 3.4871199633953024e-06, "loss": 0.2312, "step": 2107 }, { "epoch": 2.7736842105263158, "grad_norm": 0.08618277832613587, "learning_rate": 3.447155510609057e-06, "loss": 0.2297, "step": 2108 }, { "epoch": 2.775, "grad_norm": 0.0846951369483359, "learning_rate": 3.40741737109318e-06, "loss": 0.2277, "step": 2109 }, { "epoch": 2.776315789473684, "grad_norm": 0.08571322444230144, "learning_rate": 3.367905637991142e-06, "loss": 0.235, "step": 2110 }, { "epoch": 2.7776315789473687, "grad_norm": 0.08587956669273608, "learning_rate": 3.328620403915761e-06, "loss": 0.2309, "step": 2111 }, { "epoch": 2.7789473684210524, "grad_norm": 0.08783109845186413, "learning_rate": 3.2895617609489336e-06, "loss": 0.2297, "step": 2112 }, { "epoch": 2.780263157894737, "grad_norm": 0.08554797023694241, "learning_rate": 3.2507298006414497e-06, "loss": 0.235, "step": 2113 }, { "epoch": 2.781578947368421, "grad_norm": 0.08680930677926671, "learning_rate": 3.212124614012768e-06, "loss": 0.2211, "step": 2114 }, { "epoch": 2.7828947368421053, "grad_norm": 0.08766866674749912, "learning_rate": 3.1737462915508277e-06, "loss": 0.2337, "step": 2115 }, { "epoch": 2.7842105263157895, "grad_norm": 0.08826029132090077, "learning_rate": 3.135594923211771e-06, "loss": 0.2422, "step": 2116 }, { "epoch": 2.7855263157894736, "grad_norm": 0.08636371899570172, "learning_rate": 3.0976705984198106e-06, "loss": 0.2338, "step": 2117 }, { "epoch": 2.786842105263158, "grad_norm": 0.09050217986657871, "learning_rate": 3.059973406066963e-06, "loss": 0.228, "step": 2118 }, { "epoch": 2.788157894736842, "grad_norm": 0.08503847802029245, "learning_rate": 3.02250343451288e-06, "loss": 0.2271, "step": 2119 }, { "epoch": 2.7894736842105265, "grad_norm": 0.08703828648807459, "learning_rate": 2.9852607715846193e-06, "loss": 0.2366, "step": 2120 }, { "epoch": 2.7907894736842103, "grad_norm": 0.08829271415423992, "learning_rate": 2.948245504576419e-06, "loss": 0.2426, "step": 2121 }, { "epoch": 2.792105263157895, "grad_norm": 0.08927594751571939, "learning_rate": 2.9114577202495553e-06, "loss": 0.2404, "step": 2122 }, { "epoch": 2.793421052631579, "grad_norm": 0.08694095520332024, "learning_rate": 2.874897504832075e-06, "loss": 0.2378, "step": 2123 }, { "epoch": 2.794736842105263, "grad_norm": 0.08685404680967446, "learning_rate": 2.838564944018618e-06, "loss": 0.2321, "step": 2124 }, { "epoch": 2.7960526315789473, "grad_norm": 0.08768289772849402, "learning_rate": 2.802460122970241e-06, "loss": 0.241, "step": 2125 }, { "epoch": 2.7973684210526315, "grad_norm": 0.08607902569363003, "learning_rate": 2.7665831263141593e-06, "loss": 0.233, "step": 2126 }, { "epoch": 2.7986842105263157, "grad_norm": 0.08510692543767813, "learning_rate": 2.730934038143607e-06, "loss": 0.2356, "step": 2127 }, { "epoch": 2.8, "grad_norm": 0.08529239376866304, "learning_rate": 2.6955129420176196e-06, "loss": 0.2368, "step": 2128 }, { "epoch": 2.8013157894736844, "grad_norm": 0.08535080650525781, "learning_rate": 2.6603199209608187e-06, "loss": 0.225, "step": 2129 }, { "epoch": 2.8026315789473686, "grad_norm": 0.08544159810069368, "learning_rate": 2.6253550574632303e-06, "loss": 0.2278, "step": 2130 }, { "epoch": 2.8039473684210527, "grad_norm": 0.0864151284971542, "learning_rate": 2.5906184334801297e-06, "loss": 0.24, "step": 2131 }, { "epoch": 2.805263157894737, "grad_norm": 0.08574975605717337, "learning_rate": 2.556110130431788e-06, "loss": 0.2266, "step": 2132 }, { "epoch": 2.806578947368421, "grad_norm": 0.0857571071786596, "learning_rate": 2.5218302292032816e-06, "loss": 0.234, "step": 2133 }, { "epoch": 2.807894736842105, "grad_norm": 0.08886112337240165, "learning_rate": 2.487778810144381e-06, "loss": 0.2369, "step": 2134 }, { "epoch": 2.8092105263157894, "grad_norm": 0.08877261222424743, "learning_rate": 2.4539559530692758e-06, "loss": 0.2304, "step": 2135 }, { "epoch": 2.8105263157894735, "grad_norm": 0.08399044671282317, "learning_rate": 2.420361737256438e-06, "loss": 0.2282, "step": 2136 }, { "epoch": 2.8118421052631577, "grad_norm": 0.08240687984849308, "learning_rate": 2.3869962414484137e-06, "loss": 0.2231, "step": 2137 }, { "epoch": 2.8131578947368423, "grad_norm": 0.08767361332290152, "learning_rate": 2.353859543851644e-06, "loss": 0.2296, "step": 2138 }, { "epoch": 2.8144736842105265, "grad_norm": 0.08790627521525603, "learning_rate": 2.3209517221362777e-06, "loss": 0.2293, "step": 2139 }, { "epoch": 2.8157894736842106, "grad_norm": 0.08622647934206558, "learning_rate": 2.288272853436013e-06, "loss": 0.2325, "step": 2140 }, { "epoch": 2.817105263157895, "grad_norm": 0.08713384140492961, "learning_rate": 2.2558230143478797e-06, "loss": 0.2276, "step": 2141 }, { "epoch": 2.818421052631579, "grad_norm": 0.08415303494815114, "learning_rate": 2.22360228093208e-06, "loss": 0.221, "step": 2142 }, { "epoch": 2.819736842105263, "grad_norm": 0.08558597543600141, "learning_rate": 2.1916107287118015e-06, "loss": 0.2293, "step": 2143 }, { "epoch": 2.8210526315789473, "grad_norm": 0.08465218463761441, "learning_rate": 2.1598484326730837e-06, "loss": 0.2314, "step": 2144 }, { "epoch": 2.8223684210526314, "grad_norm": 0.08774203550326841, "learning_rate": 2.128315467264552e-06, "loss": 0.2359, "step": 2145 }, { "epoch": 2.8236842105263156, "grad_norm": 0.08602355712846993, "learning_rate": 2.097011906397339e-06, "loss": 0.2414, "step": 2146 }, { "epoch": 2.825, "grad_norm": 0.09054024554229902, "learning_rate": 2.0659378234448525e-06, "loss": 0.2335, "step": 2147 }, { "epoch": 2.8263157894736843, "grad_norm": 0.08533131567031418, "learning_rate": 2.035093291242607e-06, "loss": 0.2343, "step": 2148 }, { "epoch": 2.8276315789473685, "grad_norm": 0.0884644294438792, "learning_rate": 2.004478382088093e-06, "loss": 0.2345, "step": 2149 }, { "epoch": 2.8289473684210527, "grad_norm": 0.08947476508746073, "learning_rate": 1.974093167740565e-06, "loss": 0.2335, "step": 2150 }, { "epoch": 2.830263157894737, "grad_norm": 0.086504450962513, "learning_rate": 1.943937719420863e-06, "loss": 0.2414, "step": 2151 }, { "epoch": 2.831578947368421, "grad_norm": 0.08547620700378816, "learning_rate": 1.914012107811336e-06, "loss": 0.2304, "step": 2152 }, { "epoch": 2.832894736842105, "grad_norm": 0.08406046395791139, "learning_rate": 1.8843164030555527e-06, "loss": 0.2224, "step": 2153 }, { "epoch": 2.8342105263157897, "grad_norm": 0.0858958830300312, "learning_rate": 1.8548506747582129e-06, "loss": 0.2299, "step": 2154 }, { "epoch": 2.8355263157894735, "grad_norm": 0.08713981466515258, "learning_rate": 1.8256149919849807e-06, "loss": 0.24, "step": 2155 }, { "epoch": 2.836842105263158, "grad_norm": 0.08722563464111789, "learning_rate": 1.7966094232622855e-06, "loss": 0.2369, "step": 2156 }, { "epoch": 2.838157894736842, "grad_norm": 0.08782546599587424, "learning_rate": 1.7678340365772206e-06, "loss": 0.2339, "step": 2157 }, { "epoch": 2.8394736842105264, "grad_norm": 0.08820898856927462, "learning_rate": 1.7392888993773005e-06, "loss": 0.2294, "step": 2158 }, { "epoch": 2.8407894736842105, "grad_norm": 0.08559547137262101, "learning_rate": 1.7109740785703933e-06, "loss": 0.233, "step": 2159 }, { "epoch": 2.8421052631578947, "grad_norm": 0.08651933564372974, "learning_rate": 1.6828896405244988e-06, "loss": 0.2326, "step": 2160 }, { "epoch": 2.843421052631579, "grad_norm": 0.08423235088748676, "learning_rate": 1.6550356510676268e-06, "loss": 0.2314, "step": 2161 }, { "epoch": 2.844736842105263, "grad_norm": 0.08646992121393281, "learning_rate": 1.6274121754876082e-06, "loss": 0.2358, "step": 2162 }, { "epoch": 2.8460526315789476, "grad_norm": 0.08897157982270418, "learning_rate": 1.6000192785320057e-06, "loss": 0.2421, "step": 2163 }, { "epoch": 2.8473684210526313, "grad_norm": 0.08753330536214494, "learning_rate": 1.572857024407881e-06, "loss": 0.2367, "step": 2164 }, { "epoch": 2.848684210526316, "grad_norm": 0.08917423233859062, "learning_rate": 1.5459254767817066e-06, "loss": 0.2337, "step": 2165 }, { "epoch": 2.85, "grad_norm": 0.08718844159110516, "learning_rate": 1.5192246987791981e-06, "loss": 0.2316, "step": 2166 }, { "epoch": 2.8513157894736842, "grad_norm": 0.08666099472117271, "learning_rate": 1.4927547529851371e-06, "loss": 0.2145, "step": 2167 }, { "epoch": 2.8526315789473684, "grad_norm": 0.08442460380559827, "learning_rate": 1.466515701443294e-06, "loss": 0.2332, "step": 2168 }, { "epoch": 2.8539473684210526, "grad_norm": 0.08449859016985016, "learning_rate": 1.4405076056561828e-06, "loss": 0.2266, "step": 2169 }, { "epoch": 2.8552631578947367, "grad_norm": 0.08545748600057546, "learning_rate": 1.4147305265850175e-06, "loss": 0.2298, "step": 2170 }, { "epoch": 2.856578947368421, "grad_norm": 0.08929470679144376, "learning_rate": 1.3891845246495228e-06, "loss": 0.2343, "step": 2171 }, { "epoch": 2.8578947368421055, "grad_norm": 0.08668972660138131, "learning_rate": 1.3638696597277679e-06, "loss": 0.2423, "step": 2172 }, { "epoch": 2.859210526315789, "grad_norm": 0.08262056937095345, "learning_rate": 1.3387859911560664e-06, "loss": 0.2285, "step": 2173 }, { "epoch": 2.860526315789474, "grad_norm": 0.08830815458424673, "learning_rate": 1.3139335777288208e-06, "loss": 0.2334, "step": 2174 }, { "epoch": 2.861842105263158, "grad_norm": 0.08675825238016463, "learning_rate": 1.28931247769839e-06, "loss": 0.2382, "step": 2175 }, { "epoch": 2.863157894736842, "grad_norm": 0.08893171231303189, "learning_rate": 1.2649227487749548e-06, "loss": 0.2336, "step": 2176 }, { "epoch": 2.8644736842105263, "grad_norm": 0.08924178196183445, "learning_rate": 1.2407644481263858e-06, "loss": 0.2408, "step": 2177 }, { "epoch": 2.8657894736842104, "grad_norm": 0.08715372943476031, "learning_rate": 1.216837632378065e-06, "loss": 0.2315, "step": 2178 }, { "epoch": 2.8671052631578946, "grad_norm": 0.08660959792084139, "learning_rate": 1.1931423576128197e-06, "loss": 0.2416, "step": 2179 }, { "epoch": 2.8684210526315788, "grad_norm": 0.08853921306085291, "learning_rate": 1.1696786793707781e-06, "loss": 0.2335, "step": 2180 }, { "epoch": 2.8697368421052634, "grad_norm": 0.08640447512392578, "learning_rate": 1.146446652649169e-06, "loss": 0.2358, "step": 2181 }, { "epoch": 2.8710526315789475, "grad_norm": 0.08705591508512092, "learning_rate": 1.1234463319022893e-06, "loss": 0.2315, "step": 2182 }, { "epoch": 2.8723684210526317, "grad_norm": 0.08602413068650967, "learning_rate": 1.100677771041314e-06, "loss": 0.2235, "step": 2183 }, { "epoch": 2.873684210526316, "grad_norm": 0.08684507106024413, "learning_rate": 1.0781410234342094e-06, "loss": 0.24, "step": 2184 }, { "epoch": 2.875, "grad_norm": 0.09031170729723671, "learning_rate": 1.055836141905553e-06, "loss": 0.2338, "step": 2185 }, { "epoch": 2.876315789473684, "grad_norm": 0.08646029200839189, "learning_rate": 1.0337631787364687e-06, "loss": 0.2222, "step": 2186 }, { "epoch": 2.8776315789473683, "grad_norm": 0.08740568430137426, "learning_rate": 1.0119221856644712e-06, "loss": 0.2352, "step": 2187 }, { "epoch": 2.8789473684210525, "grad_norm": 0.08586656965987803, "learning_rate": 9.90313213883376e-07, "loss": 0.2263, "step": 2188 }, { "epoch": 2.8802631578947366, "grad_norm": 0.08991958053706495, "learning_rate": 9.689363140431118e-07, "loss": 0.2266, "step": 2189 }, { "epoch": 2.8815789473684212, "grad_norm": 0.08767297790926798, "learning_rate": 9.477915362496758e-07, "loss": 0.2441, "step": 2190 }, { "epoch": 2.8828947368421054, "grad_norm": 0.0898797640991881, "learning_rate": 9.268789300649894e-07, "loss": 0.2401, "step": 2191 }, { "epoch": 2.8842105263157896, "grad_norm": 0.08612849691104628, "learning_rate": 9.061985445067756e-07, "loss": 0.2281, "step": 2192 }, { "epoch": 2.8855263157894737, "grad_norm": 0.08949331264552439, "learning_rate": 8.857504280484375e-07, "loss": 0.2547, "step": 2193 }, { "epoch": 2.886842105263158, "grad_norm": 0.0851227718307587, "learning_rate": 8.65534628618958e-07, "loss": 0.2297, "step": 2194 }, { "epoch": 2.888157894736842, "grad_norm": 0.08827554977188867, "learning_rate": 8.455511936028004e-07, "loss": 0.2411, "step": 2195 }, { "epoch": 2.889473684210526, "grad_norm": 0.08846981462555736, "learning_rate": 8.258001698397744e-07, "loss": 0.2389, "step": 2196 }, { "epoch": 2.890789473684211, "grad_norm": 0.08664150903305488, "learning_rate": 8.062816036249143e-07, "loss": 0.2345, "step": 2197 }, { "epoch": 2.8921052631578945, "grad_norm": 0.0854396346836948, "learning_rate": 7.86995540708424e-07, "loss": 0.2318, "step": 2198 }, { "epoch": 2.893421052631579, "grad_norm": 0.08695987453269646, "learning_rate": 7.679420262954984e-07, "loss": 0.2424, "step": 2199 }, { "epoch": 2.8947368421052633, "grad_norm": 0.08752689227211352, "learning_rate": 7.491211050462798e-07, "loss": 0.2304, "step": 2200 }, { "epoch": 2.8960526315789474, "grad_norm": 0.08666285275563744, "learning_rate": 7.305328210757356e-07, "loss": 0.2299, "step": 2201 }, { "epoch": 2.8973684210526316, "grad_norm": 0.08663314353560102, "learning_rate": 7.121772179535135e-07, "loss": 0.2424, "step": 2202 }, { "epoch": 2.8986842105263158, "grad_norm": 0.08458000982804612, "learning_rate": 6.94054338703909e-07, "loss": 0.2316, "step": 2203 }, { "epoch": 2.9, "grad_norm": 0.08821934440133324, "learning_rate": 6.761642258056978e-07, "loss": 0.2276, "step": 2204 }, { "epoch": 2.901315789473684, "grad_norm": 0.08815511451469815, "learning_rate": 6.585069211921035e-07, "loss": 0.2379, "step": 2205 }, { "epoch": 2.9026315789473687, "grad_norm": 0.08529196165876289, "learning_rate": 6.410824662506198e-07, "loss": 0.2296, "step": 2206 }, { "epoch": 2.9039473684210524, "grad_norm": 0.08451900774747255, "learning_rate": 6.238909018229766e-07, "loss": 0.2269, "step": 2207 }, { "epoch": 2.905263157894737, "grad_norm": 0.08696348620201358, "learning_rate": 6.069322682050516e-07, "loss": 0.2324, "step": 2208 }, { "epoch": 2.906578947368421, "grad_norm": 0.08828910381160303, "learning_rate": 5.902066051467037e-07, "loss": 0.2357, "step": 2209 }, { "epoch": 2.9078947368421053, "grad_norm": 0.08483736507293088, "learning_rate": 5.737139518517509e-07, "loss": 0.2296, "step": 2210 }, { "epoch": 2.9092105263157895, "grad_norm": 0.08743099417381292, "learning_rate": 5.57454346977837e-07, "loss": 0.24, "step": 2211 }, { "epoch": 2.9105263157894736, "grad_norm": 0.08916802794242618, "learning_rate": 5.414278286363761e-07, "loss": 0.2304, "step": 2212 }, { "epoch": 2.911842105263158, "grad_norm": 0.08828220707425087, "learning_rate": 5.256344343924302e-07, "loss": 0.2419, "step": 2213 }, { "epoch": 2.913157894736842, "grad_norm": 0.08659025242576524, "learning_rate": 5.10074201264632e-07, "loss": 0.2166, "step": 2214 }, { "epoch": 2.9144736842105265, "grad_norm": 0.08525185958778574, "learning_rate": 4.947471657251068e-07, "loss": 0.2368, "step": 2215 }, { "epoch": 2.9157894736842103, "grad_norm": 0.08692863494788813, "learning_rate": 4.796533636993727e-07, "loss": 0.23, "step": 2216 }, { "epoch": 2.917105263157895, "grad_norm": 0.08819398487450925, "learning_rate": 4.647928305662852e-07, "loss": 0.2321, "step": 2217 }, { "epoch": 2.918421052631579, "grad_norm": 0.08936510229812145, "learning_rate": 4.501656011579036e-07, "loss": 0.2468, "step": 2218 }, { "epoch": 2.919736842105263, "grad_norm": 0.08708818005334107, "learning_rate": 4.3577170975945826e-07, "loss": 0.2322, "step": 2219 }, { "epoch": 2.9210526315789473, "grad_norm": 0.08388217210536535, "learning_rate": 4.216111901092501e-07, "loss": 0.2267, "step": 2220 }, { "epoch": 2.9223684210526315, "grad_norm": 0.08876373477705209, "learning_rate": 4.0768407539857333e-07, "loss": 0.2337, "step": 2221 }, { "epoch": 2.9236842105263157, "grad_norm": 0.08918972976359953, "learning_rate": 3.9399039827162643e-07, "loss": 0.2458, "step": 2222 }, { "epoch": 2.925, "grad_norm": 0.08612029906467318, "learning_rate": 3.805301908254455e-07, "loss": 0.2243, "step": 2223 }, { "epoch": 2.9263157894736844, "grad_norm": 0.08294679916711059, "learning_rate": 3.6730348460985996e-07, "loss": 0.2184, "step": 2224 }, { "epoch": 2.9276315789473686, "grad_norm": 0.08752162048000231, "learning_rate": 3.543103106273371e-07, "loss": 0.2457, "step": 2225 }, { "epoch": 2.9289473684210527, "grad_norm": 0.0859025435788662, "learning_rate": 3.415506993330153e-07, "loss": 0.2346, "step": 2226 }, { "epoch": 2.930263157894737, "grad_norm": 0.08793876747475723, "learning_rate": 3.2902468063453763e-07, "loss": 0.2391, "step": 2227 }, { "epoch": 2.931578947368421, "grad_norm": 0.08667819600458836, "learning_rate": 3.1673228389204055e-07, "loss": 0.2288, "step": 2228 }, { "epoch": 2.932894736842105, "grad_norm": 0.08786519232774889, "learning_rate": 3.046735379180543e-07, "loss": 0.2341, "step": 2229 }, { "epoch": 2.9342105263157894, "grad_norm": 0.0901967616524326, "learning_rate": 2.9284847097746923e-07, "loss": 0.2291, "step": 2230 }, { "epoch": 2.9355263157894735, "grad_norm": 0.08704968172092865, "learning_rate": 2.81257110787414e-07, "loss": 0.234, "step": 2231 }, { "epoch": 2.9368421052631577, "grad_norm": 0.08690111626726336, "learning_rate": 2.6989948451726643e-07, "loss": 0.2284, "step": 2232 }, { "epoch": 2.9381578947368423, "grad_norm": 0.08848680506181238, "learning_rate": 2.587756187885204e-07, "loss": 0.2313, "step": 2233 }, { "epoch": 2.9394736842105265, "grad_norm": 0.08596311834903651, "learning_rate": 2.4788553967474147e-07, "loss": 0.2361, "step": 2234 }, { "epoch": 2.9407894736842106, "grad_norm": 0.08490908394075776, "learning_rate": 2.372292727015557e-07, "loss": 0.2353, "step": 2235 }, { "epoch": 2.942105263157895, "grad_norm": 0.08292706658982998, "learning_rate": 2.2680684284650533e-07, "loss": 0.2309, "step": 2236 }, { "epoch": 2.943421052631579, "grad_norm": 0.08480952928040106, "learning_rate": 2.1661827453905992e-07, "loss": 0.2213, "step": 2237 }, { "epoch": 2.944736842105263, "grad_norm": 0.08635227023585973, "learning_rate": 2.066635916605386e-07, "loss": 0.2353, "step": 2238 }, { "epoch": 2.9460526315789473, "grad_norm": 0.08781565402914938, "learning_rate": 1.9694281754401024e-07, "loss": 0.2361, "step": 2239 }, { "epoch": 2.9473684210526314, "grad_norm": 0.08776480608964969, "learning_rate": 1.8745597497433765e-07, "loss": 0.2415, "step": 2240 }, { "epoch": 2.9486842105263156, "grad_norm": 0.08905978886837364, "learning_rate": 1.782030861880113e-07, "loss": 0.2308, "step": 2241 }, { "epoch": 2.95, "grad_norm": 0.08890792718531305, "learning_rate": 1.6918417287318245e-07, "loss": 0.2415, "step": 2242 }, { "epoch": 2.9513157894736843, "grad_norm": 0.08905016422570772, "learning_rate": 1.603992561695522e-07, "loss": 0.2378, "step": 2243 }, { "epoch": 2.9526315789473685, "grad_norm": 0.08893795963848528, "learning_rate": 1.518483566683826e-07, "loss": 0.2435, "step": 2244 }, { "epoch": 2.9539473684210527, "grad_norm": 0.08778434845315054, "learning_rate": 1.4353149441237445e-07, "loss": 0.2378, "step": 2245 }, { "epoch": 2.955263157894737, "grad_norm": 0.08482607245940768, "learning_rate": 1.3544868889571182e-07, "loss": 0.2206, "step": 2246 }, { "epoch": 2.956578947368421, "grad_norm": 0.08524277713713192, "learning_rate": 1.2759995906392874e-07, "loss": 0.2235, "step": 2247 }, { "epoch": 2.957894736842105, "grad_norm": 0.08760984571309041, "learning_rate": 1.199853233138981e-07, "loss": 0.2301, "step": 2248 }, { "epoch": 2.9592105263157897, "grad_norm": 0.08864338113540858, "learning_rate": 1.1260479949382064e-07, "loss": 0.241, "step": 2249 }, { "epoch": 2.9605263157894735, "grad_norm": 0.0860770469372616, "learning_rate": 1.0545840490313596e-07, "loss": 0.238, "step": 2250 }, { "epoch": 2.961842105263158, "grad_norm": 0.08309356073768617, "learning_rate": 9.854615629250053e-08, "loss": 0.2216, "step": 2251 }, { "epoch": 2.963157894736842, "grad_norm": 0.08693365254726673, "learning_rate": 9.186806986376529e-08, "loss": 0.229, "step": 2252 }, { "epoch": 2.9644736842105264, "grad_norm": 0.08720773613369974, "learning_rate": 8.542416126989805e-08, "loss": 0.2323, "step": 2253 }, { "epoch": 2.9657894736842105, "grad_norm": 0.089536393572043, "learning_rate": 7.921444561498348e-08, "loss": 0.2369, "step": 2254 }, { "epoch": 2.9671052631578947, "grad_norm": 0.08652678030587639, "learning_rate": 7.323893745416755e-08, "loss": 0.2398, "step": 2255 }, { "epoch": 2.968421052631579, "grad_norm": 0.08812917248570945, "learning_rate": 6.749765079363534e-08, "loss": 0.2262, "step": 2256 }, { "epoch": 2.969736842105263, "grad_norm": 0.08805384258384784, "learning_rate": 6.19905990905667e-08, "loss": 0.2264, "step": 2257 }, { "epoch": 2.9710526315789476, "grad_norm": 0.0845281113628984, "learning_rate": 5.6717795253113935e-08, "loss": 0.2296, "step": 2258 }, { "epoch": 2.9723684210526313, "grad_norm": 0.08972586281120497, "learning_rate": 5.167925164037968e-08, "loss": 0.2444, "step": 2259 }, { "epoch": 2.973684210526316, "grad_norm": 0.086226803286329, "learning_rate": 4.687498006236135e-08, "loss": 0.2377, "step": 2260 }, { "epoch": 2.975, "grad_norm": 0.08827582008385458, "learning_rate": 4.230499177994007e-08, "loss": 0.2275, "step": 2261 }, { "epoch": 2.9763157894736842, "grad_norm": 0.08805311064331064, "learning_rate": 3.796929750485845e-08, "loss": 0.2453, "step": 2262 }, { "epoch": 2.9776315789473684, "grad_norm": 0.08707716445589411, "learning_rate": 3.386790739968726e-08, "loss": 0.2484, "step": 2263 }, { "epoch": 2.9789473684210526, "grad_norm": 0.08937493080093048, "learning_rate": 3.000083107780327e-08, "loss": 0.2341, "step": 2264 }, { "epoch": 2.9802631578947367, "grad_norm": 0.08847213274992714, "learning_rate": 2.6368077603367015e-08, "loss": 0.2345, "step": 2265 }, { "epoch": 2.981578947368421, "grad_norm": 0.08752540842663672, "learning_rate": 2.2969655491311693e-08, "loss": 0.231, "step": 2266 }, { "epoch": 2.9828947368421055, "grad_norm": 0.08820844196032962, "learning_rate": 1.980557270729877e-08, "loss": 0.2403, "step": 2267 }, { "epoch": 2.984210526315789, "grad_norm": 0.08848035935218941, "learning_rate": 1.687583666772907e-08, "loss": 0.2475, "step": 2268 }, { "epoch": 2.985526315789474, "grad_norm": 0.08684909478055165, "learning_rate": 1.418045423968728e-08, "loss": 0.2276, "step": 2269 }, { "epoch": 2.986842105263158, "grad_norm": 0.08857801328002969, "learning_rate": 1.1719431740997433e-08, "loss": 0.2351, "step": 2270 }, { "epoch": 2.988157894736842, "grad_norm": 0.08626632234298558, "learning_rate": 9.49277494008971e-09, "loss": 0.2371, "step": 2271 }, { "epoch": 2.9894736842105263, "grad_norm": 0.08631453252565319, "learning_rate": 7.500489056133652e-09, "loss": 0.241, "step": 2272 }, { "epoch": 2.9907894736842104, "grad_norm": 0.08837237273152578, "learning_rate": 5.742578758882733e-09, "loss": 0.2304, "step": 2273 }, { "epoch": 2.9921052631578946, "grad_norm": 0.0862791762664004, "learning_rate": 4.219048168763174e-09, "loss": 0.2425, "step": 2274 }, { "epoch": 2.9934210526315788, "grad_norm": 0.08930817551049701, "learning_rate": 2.9299008568406396e-09, "loss": 0.2402, "step": 2275 }, { "epoch": 2.9947368421052634, "grad_norm": 0.08856125619943322, "learning_rate": 1.8751398447758306e-09, "loss": 0.2358, "step": 2276 }, { "epoch": 2.9960526315789475, "grad_norm": 0.08640583110457638, "learning_rate": 1.0547676048688892e-09, "loss": 0.2259, "step": 2277 }, { "epoch": 2.9973684210526317, "grad_norm": 0.08504843454486558, "learning_rate": 4.687860599927873e-10, "loss": 0.2402, "step": 2278 }, { "epoch": 2.998684210526316, "grad_norm": 0.08642252232900641, "learning_rate": 1.1719658367104202e-10, "loss": 0.2366, "step": 2279 }, { "epoch": 3.0, "grad_norm": 0.08478813908452047, "learning_rate": 0.0, "loss": 0.2337, "step": 2280 }, { "epoch": 3.0, "eval_loss": 0.2625243365764618, "eval_runtime": 136.232, "eval_samples_per_second": 37.568, "eval_steps_per_second": 1.174, "step": 2280 }, { "epoch": 3.0, "step": 2280, "total_flos": 6.730550114202419e+17, "train_loss": 0.28347440996583095, "train_runtime": 21673.3063, "train_samples_per_second": 13.46, "train_steps_per_second": 0.105 } ], "logging_steps": 1, "max_steps": 2280, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.730550114202419e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }