diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,6286 +1,4202 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 14.961571306575577, + "epoch": 9.974380871050384, "eval_steps": 500, - "global_step": 4380, + "global_step": 2920, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0034158838599487617, - "grad_norm": 1.8984375, - "learning_rate": 4.5662100456621004e-07, + "grad_norm": 3.8125, + "learning_rate": 6.849315068493151e-07, "loss": 3.0658, "step": 1 }, { "epoch": 0.017079419299743808, - "grad_norm": 1.9609375, - "learning_rate": 2.2831050228310503e-06, - "loss": 3.0722, + "grad_norm": 3.921875, + "learning_rate": 3.4246575342465754e-06, + "loss": 3.0727, "step": 5 }, { "epoch": 0.034158838599487616, - "grad_norm": 1.8984375, - "learning_rate": 4.566210045662101e-06, - "loss": 3.0516, + "grad_norm": 4.59375, + "learning_rate": 6.849315068493151e-06, + "loss": 3.0381, "step": 10 }, { "epoch": 0.05123825789923143, - "grad_norm": 2.0625, - "learning_rate": 6.849315068493151e-06, - "loss": 3.0565, + "grad_norm": 3.375, + "learning_rate": 1.0273972602739726e-05, + "loss": 2.9796, "step": 15 }, { "epoch": 0.06831767719897523, - "grad_norm": 4.125, - "learning_rate": 9.132420091324201e-06, - "loss": 3.0491, + "grad_norm": 2.546875, + "learning_rate": 1.3698630136986302e-05, + "loss": 2.8478, "step": 20 }, { "epoch": 0.08539709649871904, - "grad_norm": 1.8203125, - "learning_rate": 1.1415525114155251e-05, - "loss": 3.0567, + "grad_norm": 2.375, + "learning_rate": 1.7123287671232875e-05, + "loss": 2.7142, "step": 25 }, { "epoch": 0.10247651579846286, - "grad_norm": 2.1875, - "learning_rate": 1.3698630136986302e-05, - "loss": 2.9917, + "grad_norm": 1.4375, + "learning_rate": 2.0547945205479453e-05, + "loss": 2.5273, "step": 30 }, { "epoch": 0.11955593509820667, - "grad_norm": 2.03125, - "learning_rate": 1.5981735159817352e-05, - "loss": 2.9399, + "grad_norm": 5.34375, + "learning_rate": 2.3972602739726026e-05, + "loss": 2.3905, "step": 35 }, { "epoch": 0.13663535439795046, - "grad_norm": 1.6328125, - "learning_rate": 1.8264840182648402e-05, - "loss": 2.8463, + "grad_norm": 1.53125, + "learning_rate": 2.7397260273972603e-05, + "loss": 2.2615, "step": 40 }, { "epoch": 0.1537147736976943, - "grad_norm": 1.59375, - "learning_rate": 2.0547945205479453e-05, - "loss": 2.7478, + "grad_norm": 21.875, + "learning_rate": 3.082191780821918e-05, + "loss": 2.1359, "step": 45 }, { "epoch": 0.1707941929974381, - "grad_norm": 1.34375, - "learning_rate": 2.2831050228310503e-05, - "loss": 2.6397, + "grad_norm": 15.6875, + "learning_rate": 3.424657534246575e-05, + "loss": 2.0159, "step": 50 }, { "epoch": 0.18787361229718189, - "grad_norm": 1.3515625, - "learning_rate": 2.5114155251141553e-05, - "loss": 2.5394, + "grad_norm": 0.921875, + "learning_rate": 3.767123287671233e-05, + "loss": 1.8994, "step": 55 }, { "epoch": 0.2049530315969257, - "grad_norm": 2.828125, - "learning_rate": 2.7397260273972603e-05, - "loss": 2.4552, + "grad_norm": 0.8203125, + "learning_rate": 4.1095890410958905e-05, + "loss": 1.7873, "step": 60 }, { "epoch": 0.2220324508966695, - "grad_norm": 5.3125, - "learning_rate": 2.9680365296803654e-05, - "loss": 2.3562, + "grad_norm": 0.8671875, + "learning_rate": 4.452054794520548e-05, + "loss": 1.6691, "step": 65 }, { "epoch": 0.23911187019641333, - "grad_norm": 1.9453125, - "learning_rate": 3.1963470319634704e-05, - "loss": 2.2596, + "grad_norm": 0.7109375, + "learning_rate": 4.794520547945205e-05, + "loss": 1.5889, "step": 70 }, { "epoch": 0.2561912894961571, - "grad_norm": 1.1875, - "learning_rate": 3.424657534246575e-05, - "loss": 2.1869, + "grad_norm": 0.8984375, + "learning_rate": 5.136986301369864e-05, + "loss": 1.5329, "step": 75 }, { "epoch": 0.27327070879590093, - "grad_norm": 4.15625, - "learning_rate": 3.6529680365296805e-05, - "loss": 2.1068, + "grad_norm": 0.5390625, + "learning_rate": 5.479452054794521e-05, + "loss": 1.4749, "step": 80 }, { "epoch": 0.29035012809564475, - "grad_norm": 1.890625, - "learning_rate": 3.881278538812785e-05, - "loss": 2.0422, + "grad_norm": 0.31640625, + "learning_rate": 5.821917808219178e-05, + "loss": 1.438, "step": 85 }, { "epoch": 0.3074295473953886, - "grad_norm": 0.9453125, - "learning_rate": 4.1095890410958905e-05, - "loss": 1.9626, + "grad_norm": 0.404296875, + "learning_rate": 6.164383561643835e-05, + "loss": 1.395, "step": 90 }, { "epoch": 0.32450896669513235, - "grad_norm": 2.203125, - "learning_rate": 4.337899543378995e-05, - "loss": 1.8912, + "grad_norm": 0.2578125, + "learning_rate": 6.506849315068494e-05, + "loss": 1.3653, "step": 95 }, { "epoch": 0.3415883859948762, - "grad_norm": 7.5, - "learning_rate": 4.5662100456621006e-05, - "loss": 1.8274, + "grad_norm": 0.46875, + "learning_rate": 6.84931506849315e-05, + "loss": 1.3329, "step": 100 }, { "epoch": 0.35866780529462, - "grad_norm": 1.0, - "learning_rate": 4.794520547945205e-05, - "loss": 1.779, + "grad_norm": 0.5078125, + "learning_rate": 7.191780821917809e-05, + "loss": 1.3221, "step": 105 }, { "epoch": 0.37574722459436377, - "grad_norm": 1.09375, - "learning_rate": 5.0228310502283106e-05, - "loss": 1.7254, + "grad_norm": 0.6015625, + "learning_rate": 7.534246575342466e-05, + "loss": 1.3048, "step": 110 }, { "epoch": 0.3928266438941076, - "grad_norm": 1.1484375, - "learning_rate": 5.251141552511416e-05, - "loss": 1.6696, + "grad_norm": 0.69140625, + "learning_rate": 7.876712328767124e-05, + "loss": 1.2884, "step": 115 }, { "epoch": 0.4099060631938514, - "grad_norm": 0.474609375, - "learning_rate": 5.479452054794521e-05, - "loss": 1.6192, + "grad_norm": 0.408203125, + "learning_rate": 8.219178082191781e-05, + "loss": 1.2687, "step": 120 }, { "epoch": 0.4269854824935952, - "grad_norm": 0.482421875, - "learning_rate": 5.707762557077626e-05, - "loss": 1.5722, + "grad_norm": 0.388671875, + "learning_rate": 8.561643835616438e-05, + "loss": 1.2502, "step": 125 }, { "epoch": 0.444064901793339, - "grad_norm": 0.3515625, - "learning_rate": 5.936073059360731e-05, - "loss": 1.5393, + "grad_norm": 0.5859375, + "learning_rate": 8.904109589041096e-05, + "loss": 1.2416, "step": 130 }, { "epoch": 0.46114432109308284, - "grad_norm": 0.34375, - "learning_rate": 6.164383561643835e-05, - "loss": 1.5068, + "grad_norm": 0.3828125, + "learning_rate": 9.246575342465755e-05, + "loss": 1.2345, "step": 135 }, { "epoch": 0.47822374039282667, - "grad_norm": 0.412109375, - "learning_rate": 6.392694063926941e-05, - "loss": 1.4883, + "grad_norm": 0.7890625, + "learning_rate": 9.58904109589041e-05, + "loss": 1.2381, "step": 140 }, { "epoch": 0.49530315969257044, - "grad_norm": 0.306640625, - "learning_rate": 6.621004566210046e-05, - "loss": 1.4554, + "grad_norm": 0.5625, + "learning_rate": 9.931506849315069e-05, + "loss": 1.2236, "step": 145 }, { "epoch": 0.5123825789923142, - "grad_norm": 0.408203125, - "learning_rate": 6.84931506849315e-05, - "loss": 1.4246, + "grad_norm": 0.5703125, + "learning_rate": 0.00010273972602739728, + "loss": 1.2102, "step": 150 }, { "epoch": 0.5294619982920581, - "grad_norm": 0.376953125, - "learning_rate": 7.077625570776256e-05, - "loss": 1.4059, + "grad_norm": 1.015625, + "learning_rate": 0.00010616438356164384, + "loss": 1.203, "step": 155 }, { "epoch": 0.5465414175918019, - "grad_norm": 0.3671875, - "learning_rate": 7.305936073059361e-05, - "loss": 1.3877, + "grad_norm": 0.33984375, + "learning_rate": 0.00010958904109589041, + "loss": 1.2011, "step": 160 }, { "epoch": 0.5636208368915457, - "grad_norm": 0.3046875, - "learning_rate": 7.534246575342466e-05, - "loss": 1.3705, + "grad_norm": 0.267578125, + "learning_rate": 0.000113013698630137, + "loss": 1.193, "step": 165 }, { "epoch": 0.5807002561912895, - "grad_norm": 0.375, - "learning_rate": 7.76255707762557e-05, - "loss": 1.3634, + "grad_norm": 0.6171875, + "learning_rate": 0.00011643835616438356, + "loss": 1.1933, "step": 170 }, { "epoch": 0.5977796754910333, - "grad_norm": 0.283203125, - "learning_rate": 7.990867579908676e-05, - "loss": 1.3377, + "grad_norm": 0.41015625, + "learning_rate": 0.00011986301369863014, + "loss": 1.1774, "step": 175 }, { "epoch": 0.6148590947907772, - "grad_norm": 0.50390625, - "learning_rate": 8.219178082191781e-05, - "loss": 1.3313, + "grad_norm": 0.734375, + "learning_rate": 0.0001232876712328767, + "loss": 1.177, "step": 180 }, { "epoch": 0.6319385140905209, - "grad_norm": 0.4765625, - "learning_rate": 8.447488584474886e-05, - "loss": 1.3172, + "grad_norm": 0.9609375, + "learning_rate": 0.0001267123287671233, + "loss": 1.1705, "step": 185 }, { "epoch": 0.6490179333902647, - "grad_norm": 0.2451171875, - "learning_rate": 8.67579908675799e-05, - "loss": 1.3015, + "grad_norm": 0.44140625, + "learning_rate": 0.00013013698630136988, + "loss": 1.1612, "step": 190 }, { "epoch": 0.6660973526900086, - "grad_norm": 0.349609375, - "learning_rate": 8.904109589041096e-05, - "loss": 1.3011, + "grad_norm": 0.44140625, + "learning_rate": 0.00013356164383561644, + "loss": 1.167, "step": 195 }, { "epoch": 0.6831767719897524, - "grad_norm": 0.330078125, - "learning_rate": 9.132420091324201e-05, - "loss": 1.2912, + "grad_norm": 0.353515625, + "learning_rate": 0.000136986301369863, + "loss": 1.1616, "step": 200 }, { "epoch": 0.7002561912894961, - "grad_norm": 0.36328125, - "learning_rate": 9.360730593607307e-05, - "loss": 1.2805, + "grad_norm": 1.3203125, + "learning_rate": 0.0001404109589041096, + "loss": 1.1553, "step": 205 }, { "epoch": 0.71733561058924, - "grad_norm": 0.392578125, - "learning_rate": 9.58904109589041e-05, - "loss": 1.2683, + "grad_norm": 0.322265625, + "learning_rate": 0.00014383561643835618, + "loss": 1.1475, "step": 210 }, { "epoch": 0.7344150298889838, - "grad_norm": 0.48046875, - "learning_rate": 9.817351598173516e-05, - "loss": 1.267, + "grad_norm": 0.51953125, + "learning_rate": 0.00014726027397260274, + "loss": 1.1482, "step": 215 }, { "epoch": 0.7514944491887275, - "grad_norm": 0.703125, - "learning_rate": 0.00010045662100456621, - "loss": 1.2562, + "grad_norm": 0.671875, + "learning_rate": 0.00015068493150684933, + "loss": 1.1427, "step": 220 }, { "epoch": 0.7685738684884714, - "grad_norm": 0.5703125, - "learning_rate": 0.00010273972602739728, - "loss": 1.2553, + "grad_norm": 1.1171875, + "learning_rate": 0.00015410958904109589, + "loss": 1.1441, "step": 225 }, { "epoch": 0.7856532877882152, - "grad_norm": 0.419921875, - "learning_rate": 0.00010502283105022832, - "loss": 1.2405, + "grad_norm": 0.6640625, + "learning_rate": 0.00015753424657534247, + "loss": 1.1336, "step": 230 }, { "epoch": 0.802732707087959, - "grad_norm": 0.7421875, - "learning_rate": 0.00010730593607305936, - "loss": 1.2356, + "grad_norm": 0.70703125, + "learning_rate": 0.00016095890410958906, + "loss": 1.1315, "step": 235 }, { "epoch": 0.8198121263877028, - "grad_norm": 0.40625, - "learning_rate": 0.00010958904109589041, - "loss": 1.2318, + "grad_norm": 1.0859375, + "learning_rate": 0.00016438356164383562, + "loss": 1.1316, "step": 240 }, { "epoch": 0.8368915456874466, - "grad_norm": 0.380859375, - "learning_rate": 0.00011187214611872148, - "loss": 1.2329, + "grad_norm": 1.8671875, + "learning_rate": 0.0001678082191780822, + "loss": 1.1381, "step": 245 }, { "epoch": 0.8539709649871904, - "grad_norm": 0.2353515625, - "learning_rate": 0.00011415525114155252, - "loss": 1.2222, + "grad_norm": 1.0703125, + "learning_rate": 0.00017123287671232877, + "loss": 1.1376, "step": 250 }, { "epoch": 0.8710503842869343, - "grad_norm": 0.255859375, - "learning_rate": 0.00011643835616438356, - "loss": 1.216, + "grad_norm": 1.328125, + "learning_rate": 0.00017465753424657536, + "loss": 1.1309, "step": 255 }, { "epoch": 0.888129803586678, - "grad_norm": 0.4921875, - "learning_rate": 0.00011872146118721462, - "loss": 1.2133, + "grad_norm": 1.375, + "learning_rate": 0.00017808219178082192, + "loss": 1.1282, "step": 260 }, { "epoch": 0.9052092228864219, - "grad_norm": 0.345703125, - "learning_rate": 0.00012100456621004568, - "loss": 1.2054, + "grad_norm": 5.78125, + "learning_rate": 0.0001815068493150685, + "loss": 1.1414, "step": 265 }, { "epoch": 0.9222886421861657, - "grad_norm": 0.26171875, - "learning_rate": 0.0001232876712328767, - "loss": 1.2101, + "grad_norm": 0.47265625, + "learning_rate": 0.0001849315068493151, + "loss": 1.1403, "step": 270 }, { "epoch": 0.9393680614859095, - "grad_norm": 0.390625, - "learning_rate": 0.00012557077625570778, - "loss": 1.2071, + "grad_norm": 0.326171875, + "learning_rate": 0.00018835616438356165, + "loss": 1.1352, "step": 275 }, { "epoch": 0.9564474807856533, - "grad_norm": 0.318359375, - "learning_rate": 0.00012785388127853882, - "loss": 1.1957, + "grad_norm": 0.31640625, + "learning_rate": 0.0001917808219178082, + "loss": 1.1201, "step": 280 }, { "epoch": 0.9735269000853971, - "grad_norm": 0.453125, - "learning_rate": 0.00013013698630136988, - "loss": 1.1911, + "grad_norm": 0.3359375, + "learning_rate": 0.0001952054794520548, + "loss": 1.1145, "step": 285 }, { "epoch": 0.9906063193851409, - "grad_norm": 0.6796875, - "learning_rate": 0.00013242009132420092, - "loss": 1.1964, + "grad_norm": 0.31640625, + "learning_rate": 0.00019863013698630139, + "loss": 1.1174, "step": 290 }, { "epoch": 0.9974380871050385, - "eval_loss": 2.489213705062866, - "eval_runtime": 0.8337, - "eval_samples_per_second": 11.994, - "eval_steps_per_second": 1.199, + "eval_loss": 2.4481546878814697, + "eval_runtime": 0.5643, + "eval_samples_per_second": 17.72, + "eval_steps_per_second": 1.772, "step": 292 }, { "epoch": 1.0076857386848848, - "grad_norm": 0.6171875, - "learning_rate": 0.00013470319634703196, - "loss": 1.187, + "grad_norm": 0.41796875, + "learning_rate": 0.00019999935692582106, + "loss": 1.1057, "step": 295 }, { "epoch": 1.0247651579846284, - "grad_norm": 0.65234375, - "learning_rate": 0.000136986301369863, - "loss": 1.1809, + "grad_norm": 0.314453125, + "learning_rate": 0.00019999542705801296, + "loss": 1.0972, "step": 300 }, { "epoch": 1.0418445772843723, - "grad_norm": 0.69921875, - "learning_rate": 0.00013926940639269407, - "loss": 1.1823, + "grad_norm": 0.412109375, + "learning_rate": 0.00019998792472605885, + "loss": 1.1012, "step": 305 }, { "epoch": 1.0589239965841162, - "grad_norm": 0.5546875, - "learning_rate": 0.0001415525114155251, - "loss": 1.1685, + "grad_norm": 0.32421875, + "learning_rate": 0.00019997685019798912, + "loss": 1.0859, "step": 310 }, { "epoch": 1.0760034158838598, - "grad_norm": 0.58984375, - "learning_rate": 0.00014383561643835618, - "loss": 1.1797, + "grad_norm": 0.578125, + "learning_rate": 0.00019996220386945537, + "loss": 1.0973, "step": 315 }, { "epoch": 1.0930828351836037, - "grad_norm": 0.921875, - "learning_rate": 0.00014611872146118722, - "loss": 1.1798, + "grad_norm": 0.53515625, + "learning_rate": 0.00019994398626371643, + "loss": 1.0961, "step": 320 }, { "epoch": 1.1101622544833476, - "grad_norm": 0.625, - "learning_rate": 0.00014840182648401829, - "loss": 1.1722, + "grad_norm": 0.271484375, + "learning_rate": 0.0001999221980316194, + "loss": 1.0901, "step": 325 }, { "epoch": 1.1272416737830913, - "grad_norm": 0.3671875, - "learning_rate": 0.00015068493150684933, - "loss": 1.1595, + "grad_norm": 0.546875, + "learning_rate": 0.00019989683995157677, + "loss": 1.0761, "step": 330 }, { "epoch": 1.1443210930828351, - "grad_norm": 0.484375, - "learning_rate": 0.00015296803652968037, - "loss": 1.1633, + "grad_norm": 0.54296875, + "learning_rate": 0.0001998679129295382, + "loss": 1.082, "step": 335 }, { "epoch": 1.161400512382579, - "grad_norm": 0.412109375, - "learning_rate": 0.0001552511415525114, - "loss": 1.1587, + "grad_norm": 0.44921875, + "learning_rate": 0.0001998354179989585, + "loss": 1.0788, "step": 340 }, { "epoch": 1.1784799316823227, - "grad_norm": 0.3203125, - "learning_rate": 0.00015753424657534247, - "loss": 1.1557, + "grad_norm": 0.263671875, + "learning_rate": 0.00019979935632076048, + "loss": 1.0745, "step": 345 }, { "epoch": 1.1955593509820666, - "grad_norm": 0.365234375, - "learning_rate": 0.00015981735159817351, - "loss": 1.1586, + "grad_norm": 0.302734375, + "learning_rate": 0.00019975972918329356, + "loss": 1.0775, "step": 350 }, { "epoch": 1.2126387702818104, - "grad_norm": 0.423828125, - "learning_rate": 0.00016210045662100458, - "loss": 1.1559, + "grad_norm": 0.28515625, + "learning_rate": 0.0001997165380022878, + "loss": 1.0761, "step": 355 }, { "epoch": 1.2297181895815543, - "grad_norm": 0.375, - "learning_rate": 0.00016438356164383562, - "loss": 1.1575, + "grad_norm": 0.3046875, + "learning_rate": 0.00019966978432080316, + "loss": 1.0789, "step": 360 }, { "epoch": 1.246797608881298, - "grad_norm": 0.6015625, - "learning_rate": 0.0001666666666666667, - "loss": 1.1559, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019961946980917456, + "loss": 1.0762, "step": 365 }, { "epoch": 1.2638770281810419, - "grad_norm": 0.328125, - "learning_rate": 0.00016894977168949773, - "loss": 1.1528, + "grad_norm": 0.46875, + "learning_rate": 0.00019956559626495212, + "loss": 1.0748, "step": 370 }, { "epoch": 1.2809564474807855, - "grad_norm": 0.404296875, - "learning_rate": 0.00017123287671232877, - "loss": 1.1541, + "grad_norm": 0.2734375, + "learning_rate": 0.00019950816561283685, + "loss": 1.0756, "step": 375 }, { "epoch": 1.2980358667805294, - "grad_norm": 0.4453125, - "learning_rate": 0.0001735159817351598, - "loss": 1.147, + "grad_norm": 0.36328125, + "learning_rate": 0.00019944717990461207, + "loss": 1.0694, "step": 380 }, { "epoch": 1.3151152860802733, - "grad_norm": 0.62890625, - "learning_rate": 0.00017579908675799088, - "loss": 1.1434, + "grad_norm": 0.259765625, + "learning_rate": 0.00019938264131907, + "loss": 1.0654, "step": 385 }, { "epoch": 1.332194705380017, - "grad_norm": 0.369140625, - "learning_rate": 0.00017808219178082192, - "loss": 1.1408, + "grad_norm": 0.375, + "learning_rate": 0.00019931455216193382, + "loss": 1.0645, "step": 390 }, { "epoch": 1.3492741246797608, - "grad_norm": 0.51953125, - "learning_rate": 0.00018036529680365298, - "loss": 1.1357, + "grad_norm": 0.64453125, + "learning_rate": 0.00019924291486577559, + "loss": 1.0613, "step": 395 }, { "epoch": 1.3663535439795047, - "grad_norm": 0.48046875, - "learning_rate": 0.00018264840182648402, - "loss": 1.1451, + "grad_norm": 0.392578125, + "learning_rate": 0.000199167731989929, + "loss": 1.0689, "step": 400 }, { "epoch": 1.3834329632792486, - "grad_norm": 0.84375, - "learning_rate": 0.0001849315068493151, - "loss": 1.1419, + "grad_norm": 0.39453125, + "learning_rate": 0.00019908900622039822, + "loss": 1.065, "step": 405 }, { "epoch": 1.4005123825789922, - "grad_norm": 0.8515625, - "learning_rate": 0.00018721461187214613, - "loss": 1.1435, + "grad_norm": 0.33203125, + "learning_rate": 0.00019900674036976173, + "loss": 1.0668, "step": 410 }, { "epoch": 1.4175918018787361, - "grad_norm": 0.4453125, - "learning_rate": 0.00018949771689497717, - "loss": 1.139, + "grad_norm": 0.32421875, + "learning_rate": 0.0001989209373770719, + "loss": 1.0628, "step": 415 }, { "epoch": 1.43467122117848, - "grad_norm": 0.34375, - "learning_rate": 0.0001917808219178082, - "loss": 1.1381, + "grad_norm": 0.421875, + "learning_rate": 0.00019883160030775016, + "loss": 1.0617, "step": 420 }, { "epoch": 1.4517506404782237, - "grad_norm": 0.640625, - "learning_rate": 0.00019406392694063928, - "loss": 1.1345, + "grad_norm": 0.294921875, + "learning_rate": 0.00019873873235347719, + "loss": 1.0598, "step": 425 }, { "epoch": 1.4688300597779675, - "grad_norm": 0.6328125, - "learning_rate": 0.00019634703196347032, - "loss": 1.1293, + "grad_norm": 0.3046875, + "learning_rate": 0.00019864233683207906, + "loss": 1.0536, "step": 430 }, { "epoch": 1.4859094790777114, - "grad_norm": 0.6796875, - "learning_rate": 0.00019863013698630139, - "loss": 1.1328, + "grad_norm": 0.251953125, + "learning_rate": 0.0001985424171874087, + "loss": 1.0565, "step": 435 }, { "epoch": 1.5029888983774553, - "grad_norm": 0.287109375, - "learning_rate": 0.00019999987297289245, - "loss": 1.1359, + "grad_norm": 0.333984375, + "learning_rate": 0.00019843897698922284, + "loss": 1.0613, "step": 440 }, { "epoch": 1.520068317677199, - "grad_norm": 0.46875, - "learning_rate": 0.00019999844392163855, - "loss": 1.1271, + "grad_norm": 0.32421875, + "learning_rate": 0.0001983320199330545, + "loss": 1.0517, "step": 445 }, { "epoch": 1.5371477369769426, - "grad_norm": 0.306640625, - "learning_rate": 0.00019999542705801296, - "loss": 1.1346, + "grad_norm": 0.404296875, + "learning_rate": 0.00019822154984008088, + "loss": 1.0589, "step": 450 }, { "epoch": 1.5542271562766867, - "grad_norm": 0.435546875, - "learning_rate": 0.0001999908224299185, - "loss": 1.1278, + "grad_norm": 0.353515625, + "learning_rate": 0.00019810757065698688, + "loss": 1.0517, "step": 455 }, { "epoch": 1.5713065755764304, - "grad_norm": 0.734375, - "learning_rate": 0.00019998463011046926, - "loss": 1.1303, + "grad_norm": 0.373046875, + "learning_rate": 0.0001979900864558242, + "loss": 1.0547, "step": 460 }, { "epoch": 1.588385994876174, - "grad_norm": 0.50390625, - "learning_rate": 0.00019997685019798912, - "loss": 1.1288, + "grad_norm": 0.408203125, + "learning_rate": 0.0001978691014338658, + "loss": 1.0537, "step": 465 }, { "epoch": 1.6054654141759181, - "grad_norm": 0.37109375, - "learning_rate": 0.00019996748281601038, - "loss": 1.1171, + "grad_norm": 0.48046875, + "learning_rate": 0.00019774461991345577, + "loss": 1.0459, "step": 470 }, { "epoch": 1.6225448334756618, - "grad_norm": 0.62109375, - "learning_rate": 0.00019995652811327186, - "loss": 1.1199, + "grad_norm": 0.73046875, + "learning_rate": 0.0001976166463418552, + "loss": 1.0477, "step": 475 }, { "epoch": 1.6396242527754057, - "grad_norm": 0.58203125, - "learning_rate": 0.00019994398626371643, - "loss": 1.119, + "grad_norm": 0.6796875, + "learning_rate": 0.00019748518529108316, + "loss": 1.0472, "step": 480 }, { "epoch": 1.6567036720751496, - "grad_norm": 0.98828125, - "learning_rate": 0.00019992985746648812, - "loss": 1.1249, + "grad_norm": 0.58984375, + "learning_rate": 0.0001973502414577533, + "loss": 1.0521, "step": 485 }, { "epoch": 1.6737830913748932, - "grad_norm": 0.45703125, - "learning_rate": 0.0001999141419459293, - "loss": 1.1121, + "grad_norm": 0.375, + "learning_rate": 0.00019721181966290613, + "loss": 1.0394, "step": 490 }, { "epoch": 1.690862510674637, - "grad_norm": 0.5078125, - "learning_rate": 0.00019989683995157677, - "loss": 1.1038, + "grad_norm": 0.474609375, + "learning_rate": 0.00019706992485183684, + "loss": 1.0328, "step": 495 }, { "epoch": 1.707941929974381, - "grad_norm": 0.365234375, - "learning_rate": 0.00019987795175815807, - "loss": 1.111, + "grad_norm": 0.302734375, + "learning_rate": 0.00019692456209391846, + "loss": 1.0382, "step": 500 }, { "epoch": 1.7250213492741246, - "grad_norm": 0.31640625, - "learning_rate": 0.00019985747766558692, - "loss": 1.113, + "grad_norm": 0.294921875, + "learning_rate": 0.00019677573658242087, + "loss": 1.0418, "step": 505 }, { "epoch": 1.7421007685738685, - "grad_norm": 0.396484375, - "learning_rate": 0.0001998354179989585, - "loss": 1.1138, + "grad_norm": 0.3984375, + "learning_rate": 0.0001966234536343253, + "loss": 1.0416, "step": 510 }, { "epoch": 1.7591801878736124, - "grad_norm": 0.33984375, - "learning_rate": 0.00019981177310854448, - "loss": 1.1114, + "grad_norm": 0.294921875, + "learning_rate": 0.0001964677186901342, + "loss": 1.0399, "step": 515 }, { "epoch": 1.776259607173356, - "grad_norm": 0.353515625, - "learning_rate": 0.0001997865433697871, - "loss": 1.1114, + "grad_norm": 0.30078125, + "learning_rate": 0.00019630853731367713, + "loss": 1.0404, "step": 520 }, { "epoch": 1.7933390264731, - "grad_norm": 0.44921875, - "learning_rate": 0.00019975972918329356, - "loss": 1.1058, + "grad_norm": 0.365234375, + "learning_rate": 0.00019614591519191165, + "loss": 1.0349, "step": 525 }, { "epoch": 1.8104184457728438, - "grad_norm": 0.47265625, - "learning_rate": 0.00019973133097482947, - "loss": 1.1036, + "grad_norm": 0.39453125, + "learning_rate": 0.00019597985813472052, + "loss": 1.0303, "step": 530 }, { "epoch": 1.8274978650725875, - "grad_norm": 0.427734375, - "learning_rate": 0.00019970134919531206, - "loss": 1.1033, + "grad_norm": 0.353515625, + "learning_rate": 0.00019581037207470382, + "loss": 1.0318, "step": 535 }, { "epoch": 1.8445772843723314, - "grad_norm": 0.3828125, - "learning_rate": 0.00019966978432080316, - "loss": 1.1096, + "grad_norm": 0.3984375, + "learning_rate": 0.0001956374630669672, + "loss": 1.0386, "step": 540 }, { "epoch": 1.8616567036720753, - "grad_norm": 0.35546875, - "learning_rate": 0.00019963663685250156, - "loss": 1.0954, + "grad_norm": 0.359375, + "learning_rate": 0.00019546113728890541, + "loss": 1.0252, "step": 545 }, { "epoch": 1.878736122971819, - "grad_norm": 0.306640625, - "learning_rate": 0.00019960190731673505, - "loss": 1.1034, + "grad_norm": 0.3046875, + "learning_rate": 0.00019528140103998177, + "loss": 1.0329, "step": 550 }, { "epoch": 1.8958155422715628, - "grad_norm": 0.52734375, - "learning_rate": 0.00019956559626495212, - "loss": 1.1083, + "grad_norm": 0.265625, + "learning_rate": 0.00019509826074150298, + "loss": 1.0385, "step": 555 }, { "epoch": 1.9128949615713067, - "grad_norm": 0.375, - "learning_rate": 0.00019952770427371304, - "loss": 1.1024, + "grad_norm": 0.2734375, + "learning_rate": 0.00019491172293638968, + "loss": 1.0322, "step": 560 }, { "epoch": 1.9299743808710503, - "grad_norm": 0.40625, - "learning_rate": 0.0001994882319446809, - "loss": 1.1005, + "grad_norm": 0.25, + "learning_rate": 0.00019472179428894288, + "loss": 1.0296, "step": 565 }, { "epoch": 1.9470538001707942, - "grad_norm": 0.3359375, - "learning_rate": 0.00019944717990461207, - "loss": 1.1137, + "grad_norm": 0.28515625, + "learning_rate": 0.0001945284815846057, + "loss": 1.0434, "step": 570 }, { "epoch": 1.964133219470538, - "grad_norm": 0.30859375, - "learning_rate": 0.00019940454880534598, - "loss": 1.094, + "grad_norm": 0.2890625, + "learning_rate": 0.00019433179172972102, + "loss": 1.0228, "step": 575 }, { "epoch": 1.9812126387702818, - "grad_norm": 0.2890625, - "learning_rate": 0.00019936033932379504, - "loss": 1.097, + "grad_norm": 0.25390625, + "learning_rate": 0.00019413173175128473, + "loss": 1.0274, "step": 580 }, { "epoch": 1.9982920580700256, - "grad_norm": 0.55859375, - "learning_rate": 0.00019931455216193382, - "loss": 1.0954, + "grad_norm": 0.4765625, + "learning_rate": 0.00019392830879669463, + "loss": 1.0252, "step": 585 }, { "epoch": 1.9982920580700256, - "eval_loss": 2.4542269706726074, - "eval_runtime": 0.5521, - "eval_samples_per_second": 18.112, - "eval_steps_per_second": 1.811, + "eval_loss": 2.451388120651245, + "eval_runtime": 0.5458, + "eval_samples_per_second": 18.323, + "eval_steps_per_second": 1.832, "step": 585 }, { "epoch": 2.0153714773697695, - "grad_norm": 0.35546875, - "learning_rate": 0.00019926718804678785, - "loss": 1.0848, + "grad_norm": 0.474609375, + "learning_rate": 0.00019372153013349523, + "loss": 1.0051, "step": 590 }, { "epoch": 2.032450896669513, - "grad_norm": 0.458984375, - "learning_rate": 0.0001992182477304221, - "loss": 1.0899, + "grad_norm": 0.365234375, + "learning_rate": 0.00019351140314911795, + "loss": 1.0105, "step": 595 }, { "epoch": 2.049530315969257, - "grad_norm": 0.375, - "learning_rate": 0.000199167731989929, - "loss": 1.0937, + "grad_norm": 0.416015625, + "learning_rate": 0.00019329793535061723, + "loss": 1.0135, "step": 600 }, { "epoch": 2.066609735269001, - "grad_norm": 0.74609375, - "learning_rate": 0.00019911564162741633, - "loss": 1.0866, + "grad_norm": 0.2890625, + "learning_rate": 0.00019308113436440242, + "loss": 1.0062, "step": 605 }, { "epoch": 2.0836891545687446, - "grad_norm": 0.55078125, - "learning_rate": 0.00019906197746999408, - "loss": 1.081, - "step": 610 - }, + "grad_norm": 0.48828125, + "learning_rate": 0.0001928610079359652, + "loss": 1.0019, + "step": 610 + }, { "epoch": 2.1007685738684883, - "grad_norm": 0.33984375, - "learning_rate": 0.00019900674036976173, - "loss": 1.0857, + "grad_norm": 0.330078125, + "learning_rate": 0.00019263756392960294, + "loss": 1.0048, "step": 615 }, { "epoch": 2.1178479931682324, - "grad_norm": 0.359375, - "learning_rate": 0.00019894993120379435, - "loss": 1.0894, + "grad_norm": 0.271484375, + "learning_rate": 0.00019241081032813772, + "loss": 1.0094, "step": 620 }, { "epoch": 2.134927412467976, - "grad_norm": 0.28515625, - "learning_rate": 0.000198891550874129, - "loss": 1.0803, + "grad_norm": 0.265625, + "learning_rate": 0.00019218075523263104, + "loss": 1.0014, "step": 625 }, { "epoch": 2.1520068317677197, - "grad_norm": 0.328125, - "learning_rate": 0.00019883160030775016, - "loss": 1.0899, + "grad_norm": 0.333984375, + "learning_rate": 0.00019194740686209464, + "loss": 1.0085, "step": 630 }, { "epoch": 2.1690862510674638, - "grad_norm": 0.423828125, - "learning_rate": 0.0001987700804565752, - "loss": 1.0826, + "grad_norm": 0.40625, + "learning_rate": 0.0001917107735531966, + "loss": 1.0014, "step": 635 }, { "epoch": 2.1861656703672074, - "grad_norm": 0.447265625, - "learning_rate": 0.00019870699229743911, - "loss": 1.0842, + "grad_norm": 0.455078125, + "learning_rate": 0.0001914708637599636, + "loss": 1.0056, "step": 640 }, { "epoch": 2.203245089666951, - "grad_norm": 2.359375, - "learning_rate": 0.00019864233683207906, - "loss": 1.0816, + "grad_norm": 0.46875, + "learning_rate": 0.00019122768605347892, + "loss": 0.998, "step": 645 }, { "epoch": 2.220324508966695, - "grad_norm": 0.37890625, - "learning_rate": 0.0001985761150871185, - "loss": 1.0883, + "grad_norm": 0.265625, + "learning_rate": 0.00019098124912157632, + "loss": 1.0007, "step": 650 }, { "epoch": 2.237403928266439, - "grad_norm": 0.4296875, - "learning_rate": 0.00019850832811405087, - "loss": 1.089, + "grad_norm": 0.244140625, + "learning_rate": 0.00019073156176852935, + "loss": 1.0046, "step": 655 }, { "epoch": 2.2544833475661825, - "grad_norm": 0.3125, - "learning_rate": 0.00019843897698922284, - "loss": 1.0905, + "grad_norm": 0.30859375, + "learning_rate": 0.00019047863291473717, + "loss": 1.0084, "step": 660 }, { "epoch": 2.2715627668659266, - "grad_norm": 0.376953125, - "learning_rate": 0.00019836806281381733, - "loss": 1.0817, + "grad_norm": 0.333984375, + "learning_rate": 0.00019022247159640557, + "loss": 1.0006, "step": 665 }, { "epoch": 2.2886421861656703, - "grad_norm": 0.3671875, - "learning_rate": 0.00019829558671383585, - "loss": 1.0857, + "grad_norm": 0.49609375, + "learning_rate": 0.00018996308696522433, + "loss": 1.0057, "step": 670 }, { "epoch": 2.305721605465414, - "grad_norm": 0.37890625, - "learning_rate": 0.00019822154984008088, - "loss": 1.0824, + "grad_norm": 0.396484375, + "learning_rate": 0.00018970048828804016, + "loss": 1.0019, "step": 675 }, { "epoch": 2.322801024765158, - "grad_norm": 0.42578125, - "learning_rate": 0.00019814595336813725, - "loss": 1.0849, + "grad_norm": 0.443359375, + "learning_rate": 0.0001894346849465257, + "loss": 1.0054, "step": 680 }, { "epoch": 2.3398804440649017, - "grad_norm": 0.53125, - "learning_rate": 0.0001980687984983538, - "loss": 1.0807, + "grad_norm": 0.390625, + "learning_rate": 0.0001891656864368442, + "loss": 1.0021, "step": 685 }, { "epoch": 2.3569598633646454, - "grad_norm": 0.578125, - "learning_rate": 0.0001979900864558242, - "loss": 1.0732, + "grad_norm": 0.28515625, + "learning_rate": 0.00018889350236931055, + "loss": 0.9956, "step": 690 }, { "epoch": 2.3740392826643895, - "grad_norm": 0.310546875, - "learning_rate": 0.00019790981849036746, - "loss": 1.0855, + "grad_norm": 0.412109375, + "learning_rate": 0.00018861814246804755, + "loss": 1.0063, "step": 695 }, { "epoch": 2.391118701964133, - "grad_norm": 0.341796875, - "learning_rate": 0.00019782799587650805, - "loss": 1.0797, + "grad_norm": 0.27734375, + "learning_rate": 0.00018833961657063885, + "loss": 1.0013, "step": 700 }, { "epoch": 2.408198121263877, - "grad_norm": 0.341796875, - "learning_rate": 0.00019774461991345577, - "loss": 1.0728, + "grad_norm": 0.255859375, + "learning_rate": 0.00018805793462777734, + "loss": 0.9951, "step": 705 }, { "epoch": 2.425277540563621, - "grad_norm": 0.353515625, - "learning_rate": 0.00019765969192508508, - "loss": 1.0805, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001877731067029096, + "loss": 1.0019, "step": 710 }, { "epoch": 2.4423569598633645, - "grad_norm": 0.375, - "learning_rate": 0.00019757321325991414, - "loss": 1.074, + "grad_norm": 0.306640625, + "learning_rate": 0.00018748514297187648, + "loss": 0.995, "step": 715 }, { "epoch": 2.4594363791631086, - "grad_norm": 0.33984375, - "learning_rate": 0.00019748518529108316, - "loss": 1.0794, + "grad_norm": 0.31640625, + "learning_rate": 0.00018719405372254948, + "loss": 1.002, "step": 720 }, { "epoch": 2.4765157984628523, - "grad_norm": 0.56640625, - "learning_rate": 0.00019739560941633294, - "loss": 1.069, + "grad_norm": 0.392578125, + "learning_rate": 0.00018689984935446317, + "loss": 0.9942, "step": 725 }, { "epoch": 2.493595217762596, - "grad_norm": 0.33984375, - "learning_rate": 0.00019730448705798239, - "loss": 1.0763, + "grad_norm": 0.34765625, + "learning_rate": 0.00018660254037844388, + "loss": 1.0012, "step": 730 }, { "epoch": 2.5106746370623396, - "grad_norm": 0.333984375, - "learning_rate": 0.00019721181966290613, - "loss": 1.0798, + "grad_norm": 0.3203125, + "learning_rate": 0.00018630213741623383, + "loss": 1.002, "step": 735 }, { "epoch": 2.5277540563620837, - "grad_norm": 0.67578125, - "learning_rate": 0.00019711760870251143, - "loss": 1.075, + "grad_norm": 0.263671875, + "learning_rate": 0.00018599865120011192, + "loss": 0.9975, "step": 740 }, { "epoch": 2.5448334756618274, - "grad_norm": 0.9296875, - "learning_rate": 0.00019702185567271486, - "loss": 1.0775, + "grad_norm": 0.373046875, + "learning_rate": 0.00018569209257251026, + "loss": 0.9996, "step": 745 }, { "epoch": 2.561912894961571, - "grad_norm": 0.5703125, - "learning_rate": 0.00019692456209391846, - "loss": 1.0773, + "grad_norm": 0.306640625, + "learning_rate": 0.00018538247248562674, + "loss": 1.0001, "step": 750 }, { "epoch": 2.578992314261315, - "grad_norm": 0.443359375, - "learning_rate": 0.0001968257295109858, - "loss": 1.0713, + "grad_norm": 0.345703125, + "learning_rate": 0.00018506980200103375, + "loss": 0.9954, "step": 755 }, { "epoch": 2.596071733561059, - "grad_norm": 0.328125, - "learning_rate": 0.0001967253594932173, - "loss": 1.0719, + "grad_norm": 0.265625, + "learning_rate": 0.00018475409228928312, + "loss": 0.9945, "step": 760 }, { "epoch": 2.6131511528608025, - "grad_norm": 0.32421875, - "learning_rate": 0.0001966234536343253, - "loss": 1.0674, + "grad_norm": 0.267578125, + "learning_rate": 0.00018443535462950688, + "loss": 0.9918, "step": 765 }, { "epoch": 2.6302305721605466, - "grad_norm": 0.3828125, - "learning_rate": 0.00019652001355240878, - "loss": 1.0768, + "grad_norm": 0.375, + "learning_rate": 0.0001841136004090144, + "loss": 1.0007, "step": 770 }, { "epoch": 2.6473099914602902, - "grad_norm": 0.28515625, - "learning_rate": 0.00019641504088992778, - "loss": 1.0785, + "grad_norm": 0.251953125, + "learning_rate": 0.00018378884112288542, + "loss": 1.0026, "step": 775 }, { "epoch": 2.664389410760034, - "grad_norm": 0.322265625, - "learning_rate": 0.00019630853731367713, - "loss": 1.0716, + "grad_norm": 0.26953125, + "learning_rate": 0.00018346108837355972, + "loss": 0.995, "step": 780 }, { "epoch": 2.681468830059778, - "grad_norm": 0.515625, - "learning_rate": 0.00019620050451476007, - "loss": 1.0674, + "grad_norm": 0.37109375, + "learning_rate": 0.0001831303538704221, + "loss": 0.9916, "step": 785 }, { "epoch": 2.6985482493595216, - "grad_norm": 0.298828125, - "learning_rate": 0.0001960909442085615, - "loss": 1.0658, + "grad_norm": 0.275390625, + "learning_rate": 0.00018279664942938447, + "loss": 0.9902, "step": 790 }, { "epoch": 2.7156276686592657, - "grad_norm": 0.3046875, - "learning_rate": 0.00019597985813472052, - "loss": 1.0746, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018245998697246352, + "loss": 1.0003, "step": 795 }, { "epoch": 2.7327070879590094, - "grad_norm": 0.75, - "learning_rate": 0.00019586724805710306, - "loss": 1.0696, + "grad_norm": 0.28125, + "learning_rate": 0.00018212037852735486, + "loss": 0.9933, "step": 800 }, { "epoch": 2.749786507258753, - "grad_norm": 0.5, - "learning_rate": 0.00019575311576377366, - "loss": 1.0695, + "grad_norm": 0.283203125, + "learning_rate": 0.00018177783622700327, + "loss": 0.9934, "step": 805 }, { "epoch": 2.766865926558497, - "grad_norm": 0.353515625, - "learning_rate": 0.0001956374630669672, - "loss": 1.0633, + "grad_norm": 0.232421875, + "learning_rate": 0.0001814323723091692, + "loss": 0.9887, "step": 810 }, { "epoch": 2.783945345858241, - "grad_norm": 0.330078125, - "learning_rate": 0.0001955202918030601, - "loss": 1.069, + "grad_norm": 0.28125, + "learning_rate": 0.00018108399911599167, + "loss": 0.995, "step": 815 }, { "epoch": 2.8010247651579845, - "grad_norm": 0.5, - "learning_rate": 0.00019540160383254107, - "loss": 1.0636, + "grad_norm": 0.30078125, + "learning_rate": 0.00018073272909354727, + "loss": 0.9897, "step": 820 }, { "epoch": 2.8181041844577286, - "grad_norm": 0.380859375, - "learning_rate": 0.00019528140103998177, - "loss": 1.0681, + "grad_norm": 0.27734375, + "learning_rate": 0.00018037857479140547, + "loss": 0.9923, "step": 825 }, { "epoch": 2.8351836037574722, - "grad_norm": 0.375, - "learning_rate": 0.00019515968533400673, - "loss": 1.0637, + "grad_norm": 0.31640625, + "learning_rate": 0.00018002154886218033, + "loss": 0.9877, "step": 830 }, { "epoch": 2.852263023057216, - "grad_norm": 0.32421875, - "learning_rate": 0.000195036458647263, - "loss": 1.0677, + "grad_norm": 0.294921875, + "learning_rate": 0.00017966166406107846, + "loss": 0.9936, "step": 835 }, { "epoch": 2.86934244235696, - "grad_norm": 0.376953125, - "learning_rate": 0.00019491172293638968, - "loss": 1.0658, + "grad_norm": 0.28125, + "learning_rate": 0.00017929893324544332, + "loss": 0.9931, "step": 840 }, { "epoch": 2.8864218616567037, - "grad_norm": 0.4453125, - "learning_rate": 0.00019478548018198657, - "loss": 1.0671, + "grad_norm": 0.359375, + "learning_rate": 0.00017893336937429581, + "loss": 0.992, "step": 845 }, { "epoch": 2.9035012809564473, - "grad_norm": 0.349609375, - "learning_rate": 0.00019465773238858298, - "loss": 1.0637, + "grad_norm": 0.26171875, + "learning_rate": 0.00017856498550787144, + "loss": 0.9896, "step": 850 }, { "epoch": 2.9205807002561914, - "grad_norm": 0.55859375, - "learning_rate": 0.0001945284815846057, - "loss": 1.0638, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001781937948071536, + "loss": 0.9899, "step": 855 }, { "epoch": 2.937660119555935, - "grad_norm": 0.357421875, - "learning_rate": 0.00019439772982234697, - "loss": 1.0623, + "grad_norm": 0.283203125, + "learning_rate": 0.00017781981053340337, + "loss": 0.9869, "step": 860 }, { "epoch": 2.9547395388556787, - "grad_norm": 0.39453125, - "learning_rate": 0.0001942654791779317, - "loss": 1.0597, + "grad_norm": 0.43359375, + "learning_rate": 0.00017744304604768588, + "loss": 0.9865, "step": 865 }, { "epoch": 2.971818958155423, - "grad_norm": 0.482421875, - "learning_rate": 0.00019413173175128473, - "loss": 1.0629, + "grad_norm": 0.423828125, + "learning_rate": 0.00017706351481039284, + "loss": 0.9885, "step": 870 }, { "epoch": 2.9888983774551665, - "grad_norm": 0.5390625, - "learning_rate": 0.0001939964896660972, - "loss": 1.0621, + "grad_norm": 0.3125, + "learning_rate": 0.00017668123038076163, + "loss": 0.988, "step": 875 }, { "epoch": 2.9991460290350127, - "eval_loss": 2.453326463699341, - "eval_runtime": 0.5495, - "eval_samples_per_second": 18.197, - "eval_steps_per_second": 1.82, + "eval_loss": 2.468273639678955, + "eval_runtime": 0.5562, + "eval_samples_per_second": 17.978, + "eval_steps_per_second": 1.798, "step": 878 }, { "epoch": 3.00597779675491, - "grad_norm": 0.28515625, - "learning_rate": 0.0001938597550697932, - "loss": 1.0542, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017629620641639103, + "loss": 0.9741, "step": 880 }, { "epoch": 3.0230572160546543, - "grad_norm": 0.380859375, - "learning_rate": 0.00019372153013349523, - "loss": 1.0482, + "grad_norm": 0.337890625, + "learning_rate": 0.00017590845667275312, + "loss": 0.9621, "step": 885 }, { "epoch": 3.040136635354398, - "grad_norm": 0.73046875, - "learning_rate": 0.00019358181705199015, - "loss": 1.0565, + "grad_norm": 0.390625, + "learning_rate": 0.00017551799500270198, + "loss": 0.968, "step": 890 }, { "epoch": 3.0572160546541416, - "grad_norm": 0.58203125, - "learning_rate": 0.00019344061804369412, - "loss": 1.0564, + "grad_norm": 0.326171875, + "learning_rate": 0.00017512483535597867, + "loss": 0.9683, "step": 895 }, { "epoch": 3.0742954739538857, - "grad_norm": 0.486328125, - "learning_rate": 0.00019329793535061723, - "loss": 1.0544, + "grad_norm": 0.25, + "learning_rate": 0.00017472899177871297, + "loss": 0.9671, "step": 900 }, { "epoch": 3.0913748932536294, - "grad_norm": 0.341796875, - "learning_rate": 0.00019315377123832827, - "loss": 1.042, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001743304784129214, + "loss": 0.9563, "step": 905 }, { "epoch": 3.108454312553373, - "grad_norm": 0.4453125, - "learning_rate": 0.00019300812799591846, - "loss": 1.0552, + "grad_norm": 0.294921875, + "learning_rate": 0.00017392930949600217, + "loss": 0.9678, "step": 910 }, { "epoch": 3.125533731853117, - "grad_norm": 0.306640625, - "learning_rate": 0.0001928610079359652, - "loss": 1.044, + "grad_norm": 0.240234375, + "learning_rate": 0.0001735254993602264, + "loss": 0.9594, "step": 915 }, { "epoch": 3.1426131511528608, - "grad_norm": 0.279296875, - "learning_rate": 0.00019271241339449536, - "loss": 1.0546, + "grad_norm": 0.283203125, + "learning_rate": 0.00017311906243222614, + "loss": 0.9691, "step": 920 }, { "epoch": 3.1596925704526044, - "grad_norm": 0.412109375, - "learning_rate": 0.00019256234673094814, - "loss": 1.0553, + "grad_norm": 0.2578125, + "learning_rate": 0.0001727100132324789, + "loss": 0.9694, "step": 925 }, { "epoch": 3.1767719897523485, - "grad_norm": 0.314453125, - "learning_rate": 0.00019241081032813772, - "loss": 1.0522, + "grad_norm": 0.29296875, + "learning_rate": 0.00017229836637478902, + "loss": 0.9678, "step": 930 }, { "epoch": 3.193851409052092, - "grad_norm": 0.3046875, - "learning_rate": 0.00019225780659221523, - "loss": 1.0579, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017188413656576534, + "loss": 0.972, "step": 935 }, { "epoch": 3.210930828351836, - "grad_norm": 0.34765625, - "learning_rate": 0.00019210333795263075, - "loss": 1.0516, + "grad_norm": 0.267578125, + "learning_rate": 0.00017146733860429612, + "loss": 0.9661, "step": 940 }, { "epoch": 3.22801024765158, - "grad_norm": 0.345703125, - "learning_rate": 0.00019194740686209464, - "loss": 1.0428, + "grad_norm": 0.2578125, + "learning_rate": 0.00017104798738101993, + "loss": 0.9567, "step": 945 }, { "epoch": 3.2450896669513236, - "grad_norm": 0.61328125, - "learning_rate": 0.00019179001579653853, - "loss": 1.045, + "grad_norm": 0.28515625, + "learning_rate": 0.00017062609787779403, + "loss": 0.9605, "step": 950 }, { "epoch": 3.2621690862510673, - "grad_norm": 0.5390625, - "learning_rate": 0.00019163116725507619, - "loss": 1.0534, + "grad_norm": 0.259765625, + "learning_rate": 0.00017020168516715894, + "loss": 0.9678, "step": 955 }, { "epoch": 3.2792485055508114, - "grad_norm": 0.427734375, - "learning_rate": 0.0001914708637599636, - "loss": 1.0463, + "grad_norm": 0.271484375, + "learning_rate": 0.00016977476441179992, + "loss": 0.961, "step": 960 }, { "epoch": 3.296327924850555, - "grad_norm": 0.37890625, - "learning_rate": 0.00019130910785655907, - "loss": 1.0482, + "grad_norm": 0.298828125, + "learning_rate": 0.00016934535086400538, + "loss": 0.9657, "step": 965 }, { "epoch": 3.313407344150299, - "grad_norm": 0.314453125, - "learning_rate": 0.00019114590211328288, - "loss": 1.0431, + "grad_norm": 0.2734375, + "learning_rate": 0.0001689134598651219, + "loss": 0.9601, "step": 970 }, { "epoch": 3.330486763450043, - "grad_norm": 0.306640625, - "learning_rate": 0.00019098124912157632, - "loss": 1.0487, + "grad_norm": 0.2314453125, + "learning_rate": 0.00016847910684500615, + "loss": 0.9652, "step": 975 }, { "epoch": 3.3475661827497865, - "grad_norm": 0.5625, - "learning_rate": 0.0001908151514958606, - "loss": 1.0591, + "grad_norm": 0.298828125, + "learning_rate": 0.0001680423073214737, + "loss": 0.9755, "step": 980 }, { "epoch": 3.3646456020495306, - "grad_norm": 0.53125, - "learning_rate": 0.00019064761187349548, - "loss": 1.0458, + "grad_norm": 0.42578125, + "learning_rate": 0.0001676030768997445, + "loss": 0.9641, "step": 985 }, { "epoch": 3.381725021349274, - "grad_norm": 0.50390625, - "learning_rate": 0.00019047863291473717, - "loss": 1.0488, + "grad_norm": 0.6875, + "learning_rate": 0.00016716143127188548, + "loss": 0.9652, "step": 990 }, { "epoch": 3.398804440649018, - "grad_norm": 0.5546875, - "learning_rate": 0.00019030821730269624, - "loss": 1.0472, + "grad_norm": 0.400390625, + "learning_rate": 0.0001667173862162499, + "loss": 0.9647, "step": 995 }, { "epoch": 3.415883859948762, - "grad_norm": 0.921875, - "learning_rate": 0.00019013636774329495, - "loss": 1.0506, + "grad_norm": 0.2890625, + "learning_rate": 0.00016627095759691362, + "loss": 0.9689, "step": 1000 }, { "epoch": 3.4329632792485056, - "grad_norm": 0.74609375, - "learning_rate": 0.00018996308696522433, - "loss": 1.0488, + "grad_norm": 0.318359375, + "learning_rate": 0.0001658221613631083, + "loss": 0.9656, "step": 1005 }, { "epoch": 3.4500426985482493, - "grad_norm": 0.60546875, - "learning_rate": 0.00018978837771990085, - "loss": 1.0425, + "grad_norm": 0.29296875, + "learning_rate": 0.0001653710135486518, + "loss": 0.9601, "step": 1010 }, { "epoch": 3.4671221178479934, - "grad_norm": 0.361328125, - "learning_rate": 0.00018961224278142268, - "loss": 1.05, + "grad_norm": 0.34765625, + "learning_rate": 0.00016491753027137498, + "loss": 0.9669, "step": 1015 }, { "epoch": 3.484201537147737, - "grad_norm": 0.29296875, - "learning_rate": 0.0001894346849465257, - "loss": 1.0406, + "grad_norm": 0.302734375, + "learning_rate": 0.00016446172773254629, + "loss": 0.9606, "step": 1020 }, { "epoch": 3.5012809564474807, - "grad_norm": 0.390625, - "learning_rate": 0.000189255707034539, - "loss": 1.0502, + "grad_norm": 0.26953125, + "learning_rate": 0.00016400362221629264, + "loss": 0.9693, "step": 1025 }, { "epoch": 3.518360375747225, - "grad_norm": 0.361328125, - "learning_rate": 0.00018907531188734026, - "loss": 1.0451, + "grad_norm": 0.32421875, + "learning_rate": 0.00016354323008901776, + "loss": 0.9631, "step": 1030 }, { "epoch": 3.5354397950469685, - "grad_norm": 0.380859375, - "learning_rate": 0.00018889350236931055, - "loss": 1.041, + "grad_norm": 0.34375, + "learning_rate": 0.0001630805677988175, + "loss": 0.9601, "step": 1035 }, { "epoch": 3.552519214346712, - "grad_norm": 0.404296875, - "learning_rate": 0.00018871028136728874, - "loss": 1.04, + "grad_norm": 0.24609375, + "learning_rate": 0.0001626156518748922, + "loss": 0.9593, "step": 1040 }, { "epoch": 3.5695986336464562, - "grad_norm": 0.466796875, - "learning_rate": 0.0001885256517905259, - "loss": 1.0432, + "grad_norm": 0.21875, + "learning_rate": 0.00016214849892695602, + "loss": 0.9611, "step": 1045 }, { "epoch": 3.5866780529462, - "grad_norm": 0.2890625, - "learning_rate": 0.00018833961657063885, - "loss": 1.0473, + "grad_norm": 0.283203125, + "learning_rate": 0.00016167912564464383, + "loss": 0.966, "step": 1050 }, { "epoch": 3.6037574722459436, - "grad_norm": 0.322265625, - "learning_rate": 0.00018815217866156387, - "loss": 1.0475, + "grad_norm": 0.267578125, + "learning_rate": 0.00016120754879691464, + "loss": 0.9651, "step": 1055 }, { "epoch": 3.6208368915456877, - "grad_norm": 0.470703125, - "learning_rate": 0.0001879633410395095, - "loss": 1.04, + "grad_norm": 0.330078125, + "learning_rate": 0.0001607337852314527, + "loss": 0.9591, "step": 1060 }, { "epoch": 3.6379163108454313, - "grad_norm": 0.359375, - "learning_rate": 0.0001877731067029096, - "loss": 1.0408, + "grad_norm": 0.27734375, + "learning_rate": 0.00016025785187406553, + "loss": 0.9578, "step": 1065 }, { "epoch": 3.654995730145175, - "grad_norm": 0.380859375, - "learning_rate": 0.00018758147867237548, - "loss": 1.0497, + "grad_norm": 0.369140625, + "learning_rate": 0.0001597797657280792, + "loss": 0.9691, "step": 1070 }, { "epoch": 3.672075149444919, - "grad_norm": 0.53125, - "learning_rate": 0.000187388459990648, - "loss": 1.0388, + "grad_norm": 0.25, + "learning_rate": 0.00015929954387373103, + "loss": 0.9591, "step": 1075 }, { "epoch": 3.6891545687446627, - "grad_norm": 0.333984375, - "learning_rate": 0.00018719405372254948, - "loss": 1.0444, + "grad_norm": 0.478515625, + "learning_rate": 0.00015881720346755905, + "loss": 0.9629, "step": 1080 }, { "epoch": 3.7062339880444064, - "grad_norm": 0.310546875, - "learning_rate": 0.00018699826295493462, - "loss": 1.0355, + "grad_norm": 0.267578125, + "learning_rate": 0.00015833276174178937, + "loss": 0.9564, "step": 1085 }, { "epoch": 3.7233134073441505, - "grad_norm": 0.328125, - "learning_rate": 0.00018680109079664188, - "loss": 1.044, + "grad_norm": 0.2578125, + "learning_rate": 0.00015784623600372042, + "loss": 0.9644, "step": 1090 }, { "epoch": 3.740392826643894, - "grad_norm": 0.490234375, - "learning_rate": 0.00018660254037844388, - "loss": 1.0379, + "grad_norm": 0.3203125, + "learning_rate": 0.0001573576436351046, + "loss": 0.9592, "step": 1095 }, { "epoch": 3.757472245943638, - "grad_norm": 0.388671875, - "learning_rate": 0.0001864026148529978, - "loss": 1.0523, + "grad_norm": 0.2890625, + "learning_rate": 0.00015686700209152738, + "loss": 0.9709, "step": 1100 }, { "epoch": 3.774551665243382, - "grad_norm": 0.291015625, - "learning_rate": 0.00018620131739479525, - "loss": 1.0454, + "grad_norm": 1.3828125, + "learning_rate": 0.00015637432890178353, + "loss": 0.9658, "step": 1105 }, { "epoch": 3.7916310845431256, - "grad_norm": 0.4140625, - "learning_rate": 0.00018599865120011192, - "loss": 1.039, + "grad_norm": 0.314453125, + "learning_rate": 0.00015587964166725095, + "loss": 0.9621, "step": 1110 }, { "epoch": 3.8087105038428692, - "grad_norm": 0.46484375, - "learning_rate": 0.0001857946194869568, - "loss": 1.0452, + "grad_norm": 0.2353515625, + "learning_rate": 0.00015538295806126205, + "loss": 0.9648, "step": 1115 }, { "epoch": 3.8257899231426133, - "grad_norm": 0.322265625, - "learning_rate": 0.00018558922549502107, - "loss": 1.0444, + "grad_norm": 0.306640625, + "learning_rate": 0.00015488429582847192, + "loss": 0.9647, "step": 1120 }, { "epoch": 3.842869342442357, - "grad_norm": 0.31640625, - "learning_rate": 0.00018538247248562674, - "loss": 1.0351, + "grad_norm": 0.27734375, + "learning_rate": 0.0001543836727842248, + "loss": 0.9569, "step": 1125 }, { "epoch": 3.8599487617421007, - "grad_norm": 0.357421875, - "learning_rate": 0.0001851743637416747, - "loss": 1.041, + "grad_norm": 0.318359375, + "learning_rate": 0.00015388110681391725, + "loss": 0.9615, "step": 1130 }, { "epoch": 3.8770281810418448, - "grad_norm": 0.42578125, - "learning_rate": 0.00018496490256759277, - "loss": 1.0364, + "grad_norm": 0.2412109375, + "learning_rate": 0.00015337661587235953, + "loss": 0.9561, "step": 1135 }, { "epoch": 3.8941076003415884, - "grad_norm": 0.4609375, - "learning_rate": 0.00018475409228928312, - "loss": 1.0476, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001528702179831338, + "loss": 0.9686, "step": 1140 }, { "epoch": 3.911187019641332, - "grad_norm": 0.3671875, - "learning_rate": 0.00018454193625406956, - "loss": 1.0376, + "grad_norm": 0.28515625, + "learning_rate": 0.00015236193123795041, + "loss": 0.959, "step": 1145 }, { "epoch": 3.928266438941076, - "grad_norm": 0.439453125, - "learning_rate": 0.00018432843783064429, - "loss": 1.0342, + "grad_norm": 0.25, + "learning_rate": 0.00015185177379600152, + "loss": 0.9545, "step": 1150 }, { "epoch": 3.94534585824082, - "grad_norm": 0.30859375, - "learning_rate": 0.0001841136004090144, - "loss": 1.0422, + "grad_norm": 0.322265625, + "learning_rate": 0.00015133976388331227, + "loss": 0.9626, "step": 1155 }, { "epoch": 3.9624252775405635, - "grad_norm": 0.310546875, - "learning_rate": 0.00018389742740044813, - "loss": 1.0393, + "grad_norm": 0.26171875, + "learning_rate": 0.00015082591979208976, + "loss": 0.9595, "step": 1160 }, { "epoch": 3.9795046968403076, - "grad_norm": 0.322265625, - "learning_rate": 0.00018367992223742067, - "loss": 1.0371, + "grad_norm": 0.24609375, + "learning_rate": 0.00015031025988006936, + "loss": 0.959, "step": 1165 }, { "epoch": 3.9965841161400513, - "grad_norm": 0.294921875, - "learning_rate": 0.00018346108837355972, - "loss": 1.0523, + "grad_norm": 0.28515625, + "learning_rate": 0.000149792802569859, + "loss": 0.9741, "step": 1170 }, { "epoch": 4.0, - "eval_loss": 2.4546658992767334, - "eval_runtime": 0.5459, - "eval_samples_per_second": 18.317, - "eval_steps_per_second": 1.832, + "eval_loss": 2.4999709129333496, + "eval_runtime": 0.5563, + "eval_samples_per_second": 17.977, + "eval_steps_per_second": 1.798, "step": 1171 }, { "epoch": 4.013663535439795, - "grad_norm": 0.48046875, - "learning_rate": 0.00018324092928359041, - "loss": 1.0323, + "grad_norm": 0.419921875, + "learning_rate": 0.00014927356634828094, + "loss": 0.943, "step": 1175 }, { "epoch": 4.030742954739539, - "grad_norm": 0.41015625, - "learning_rate": 0.00018301944846328049, - "loss": 1.0219, + "grad_norm": 0.314453125, + "learning_rate": 0.00014875256976571135, + "loss": 0.9301, "step": 1180 }, { "epoch": 4.047822374039282, - "grad_norm": 0.3984375, - "learning_rate": 0.00018279664942938447, - "loss": 1.0262, + "grad_norm": 0.25, + "learning_rate": 0.00014822983143541752, + "loss": 0.9339, "step": 1185 }, { "epoch": 4.064901793339026, - "grad_norm": 0.314453125, - "learning_rate": 0.0001825725357195881, - "loss": 1.0191, + "grad_norm": 0.3125, + "learning_rate": 0.0001477053700328929, + "loss": 0.9284, "step": 1190 }, { "epoch": 4.0819812126387705, - "grad_norm": 0.4375, - "learning_rate": 0.0001823471108924519, - "loss": 1.0331, + "grad_norm": 0.30078125, + "learning_rate": 0.00014717920429518984, + "loss": 0.9403, "step": 1195 }, { "epoch": 4.099060631938514, - "grad_norm": 0.365234375, - "learning_rate": 0.00018212037852735486, - "loss": 1.0269, + "grad_norm": 0.3203125, + "learning_rate": 0.00014665135302025035, + "loss": 0.936, "step": 1200 }, { "epoch": 4.116140051238258, - "grad_norm": 0.349609375, - "learning_rate": 0.00018189234222443763, - "loss": 1.0282, + "grad_norm": 0.306640625, + "learning_rate": 0.00014612183506623432, + "loss": 0.9361, "step": 1205 }, { "epoch": 4.133219470538002, - "grad_norm": 0.287109375, - "learning_rate": 0.0001816630056045451, - "loss": 1.027, + "grad_norm": 0.28515625, + "learning_rate": 0.00014559066935084588, + "loss": 0.9354, "step": 1210 }, { "epoch": 4.150298889837745, - "grad_norm": 0.310546875, - "learning_rate": 0.0001814323723091692, - "loss": 1.026, + "grad_norm": 0.380859375, + "learning_rate": 0.0001450578748506576, + "loss": 0.9339, "step": 1215 }, { "epoch": 4.167378309137489, - "grad_norm": 0.333984375, - "learning_rate": 0.0001812004460003909, - "loss": 1.0228, + "grad_norm": 0.248046875, + "learning_rate": 0.00014452347060043237, + "loss": 0.9319, "step": 1220 }, { "epoch": 4.184457728437233, - "grad_norm": 0.423828125, - "learning_rate": 0.00018096723036082214, - "loss": 1.0319, + "grad_norm": 0.330078125, + "learning_rate": 0.00014398747569244354, + "loss": 0.9403, "step": 1225 }, { "epoch": 4.2015371477369765, - "grad_norm": 0.333984375, - "learning_rate": 0.00018073272909354727, - "loss": 1.027, + "grad_norm": 0.2578125, + "learning_rate": 0.00014344990927579268, + "loss": 0.9368, "step": 1230 }, { "epoch": 4.218616567036721, - "grad_norm": 0.5703125, - "learning_rate": 0.0001804969459220644, - "loss": 1.0217, + "grad_norm": 0.294921875, + "learning_rate": 0.00014291079055572554, + "loss": 0.9327, "step": 1235 }, { "epoch": 4.235695986336465, - "grad_norm": 0.462890625, - "learning_rate": 0.0001802598845902262, - "loss": 1.0332, + "grad_norm": 0.24609375, + "learning_rate": 0.0001423701387929459, + "loss": 0.942, "step": 1240 }, { "epoch": 4.252775405636209, - "grad_norm": 0.421875, - "learning_rate": 0.00018002154886218033, - "loss": 1.0293, + "grad_norm": 0.296875, + "learning_rate": 0.0001418279733029274, + "loss": 0.9416, "step": 1245 }, { "epoch": 4.269854824935952, - "grad_norm": 0.31640625, - "learning_rate": 0.00017978194252230985, - "loss": 1.0259, + "grad_norm": 0.357421875, + "learning_rate": 0.0001412843134552235, + "loss": 0.9365, "step": 1250 }, { "epoch": 4.286934244235696, - "grad_norm": 0.2890625, - "learning_rate": 0.00017954106937517316, - "loss": 1.0222, + "grad_norm": 0.2421875, + "learning_rate": 0.00014073917867277557, + "loss": 0.9334, "step": 1255 }, { "epoch": 4.304013663535439, - "grad_norm": 0.271484375, - "learning_rate": 0.00017929893324544332, - "loss": 1.0259, + "grad_norm": 0.322265625, + "learning_rate": 0.00014019258843121893, + "loss": 0.9374, "step": 1260 }, { "epoch": 4.3210930828351835, - "grad_norm": 0.3046875, - "learning_rate": 0.00017905553797784759, - "loss": 1.0195, + "grad_norm": 0.34765625, + "learning_rate": 0.0001396445622581869, + "loss": 0.9309, "step": 1265 }, { "epoch": 4.3381725021349276, - "grad_norm": 0.296875, - "learning_rate": 0.0001788108874371063, - "loss": 1.0139, + "grad_norm": 0.2421875, + "learning_rate": 0.0001390951197326134, + "loss": 0.9256, "step": 1270 }, { "epoch": 4.355251921434672, - "grad_norm": 0.34375, - "learning_rate": 0.00017856498550787144, - "loss": 1.0215, + "grad_norm": 0.29296875, + "learning_rate": 0.00013854428048403324, + "loss": 0.9336, "step": 1275 }, { "epoch": 4.372331340734415, - "grad_norm": 0.345703125, - "learning_rate": 0.00017831783609466504, - "loss": 1.0332, + "grad_norm": 0.275390625, + "learning_rate": 0.00013799206419188103, + "loss": 0.9441, "step": 1280 }, { "epoch": 4.389410760034159, - "grad_norm": 0.2734375, - "learning_rate": 0.0001780694431218171, - "loss": 1.0242, + "grad_norm": 0.232421875, + "learning_rate": 0.00013743849058478808, + "loss": 0.938, "step": 1285 }, { "epoch": 4.406490179333902, - "grad_norm": 0.365234375, - "learning_rate": 0.00017781981053340337, - "loss": 1.0262, + "grad_norm": 0.3515625, + "learning_rate": 0.00013688357943987732, + "loss": 0.9389, "step": 1290 }, { "epoch": 4.423569598633646, - "grad_norm": 0.328125, - "learning_rate": 0.00017756894229318263, - "loss": 1.0323, + "grad_norm": 0.29296875, + "learning_rate": 0.00013632735058205706, + "loss": 0.945, "step": 1295 }, { "epoch": 4.44064901793339, - "grad_norm": 0.3203125, - "learning_rate": 0.00017731684238453385, - "loss": 1.0234, + "grad_norm": 0.232421875, + "learning_rate": 0.0001357698238833126, + "loss": 0.9378, "step": 1300 }, { "epoch": 4.4577284372331345, - "grad_norm": 0.341796875, - "learning_rate": 0.00017706351481039284, - "loss": 1.0224, + "grad_norm": 0.279296875, + "learning_rate": 0.00013521101926199607, + "loss": 0.9378, "step": 1305 }, { "epoch": 4.474807856532878, - "grad_norm": 0.412109375, - "learning_rate": 0.0001768089635931887, - "loss": 1.0277, + "grad_norm": 0.291015625, + "learning_rate": 0.0001346509566821153, + "loss": 0.9409, "step": 1310 }, { "epoch": 4.491887275832622, - "grad_norm": 0.3671875, - "learning_rate": 0.00017655319277478016, - "loss": 1.0228, + "grad_norm": 0.3125, + "learning_rate": 0.00013408965615262008, + "loss": 0.9363, "step": 1315 }, { "epoch": 4.508966695132365, - "grad_norm": 0.5390625, - "learning_rate": 0.00017629620641639103, - "loss": 1.028, + "grad_norm": 0.265625, + "learning_rate": 0.00013352713772668765, + "loss": 0.9413, "step": 1320 }, { "epoch": 4.526046114432109, - "grad_norm": 0.4296875, - "learning_rate": 0.000176038008598546, - "loss": 1.0412, + "grad_norm": 0.2470703125, + "learning_rate": 0.00013296342150100605, + "loss": 0.9509, "step": 1325 }, { "epoch": 4.543125533731853, - "grad_norm": 0.451171875, - "learning_rate": 0.00017577860342100579, - "loss": 1.0253, + "grad_norm": 0.24609375, + "learning_rate": 0.00013239852761505626, + "loss": 0.9361, "step": 1330 }, { "epoch": 4.560204953031597, - "grad_norm": 0.31640625, - "learning_rate": 0.00017551799500270198, - "loss": 1.0233, + "grad_norm": 0.255859375, + "learning_rate": 0.00013183247625039282, + "loss": 0.9366, "step": 1335 }, { "epoch": 4.577284372331341, - "grad_norm": 0.369140625, - "learning_rate": 0.0001752561874816717, - "loss": 1.0259, + "grad_norm": 0.24609375, + "learning_rate": 0.00013126528762992247, + "loss": 0.9381, "step": 1340 }, { "epoch": 4.594363791631085, - "grad_norm": 0.33203125, - "learning_rate": 0.00017499318501499177, - "loss": 1.0265, + "grad_norm": 0.275390625, + "learning_rate": 0.000130696982017182, + "loss": 0.9394, "step": 1345 }, { "epoch": 4.611443210930828, - "grad_norm": 0.373046875, - "learning_rate": 0.00017472899177871297, - "loss": 1.0229, + "grad_norm": 0.294921875, + "learning_rate": 0.00013012757971561415, + "loss": 0.9363, "step": 1350 }, { "epoch": 4.628522630230572, - "grad_norm": 0.373046875, - "learning_rate": 0.00017446361196779342, - "loss": 1.0194, + "grad_norm": 0.291015625, + "learning_rate": 0.00012955710106784214, + "loss": 0.9323, "step": 1355 }, { "epoch": 4.645602049530316, - "grad_norm": 0.5703125, - "learning_rate": 0.00017419704979603214, - "loss": 1.0261, + "grad_norm": 0.294921875, + "learning_rate": 0.00012898556645494325, + "loss": 0.9387, "step": 1360 }, { "epoch": 4.66268146883006, - "grad_norm": 0.458984375, - "learning_rate": 0.00017392930949600217, - "loss": 1.0226, + "grad_norm": 0.291015625, + "learning_rate": 0.00012841299629572032, + "loss": 0.935, "step": 1365 }, { "epoch": 4.679760888129803, - "grad_norm": 0.65625, - "learning_rate": 0.00017366039531898326, - "loss": 1.0319, + "grad_norm": 0.359375, + "learning_rate": 0.0001278394110459724, + "loss": 0.9446, "step": 1370 }, { "epoch": 4.6968403074295475, - "grad_norm": 0.59765625, - "learning_rate": 0.00017339031153489444, - "loss": 1.0249, + "grad_norm": 0.255859375, + "learning_rate": 0.000127264831197764, + "loss": 0.9372, "step": 1375 }, { "epoch": 4.713919726729291, - "grad_norm": 0.345703125, - "learning_rate": 0.00017311906243222614, - "loss": 1.0244, + "grad_norm": 0.275390625, + "learning_rate": 0.0001266892772786929, + "loss": 0.9363, "step": 1380 }, { "epoch": 4.730999146029035, - "grad_norm": 0.466796875, - "learning_rate": 0.00017284665231797223, - "loss": 1.0273, + "grad_norm": 0.26171875, + "learning_rate": 0.00012611276985115678, + "loss": 0.9394, "step": 1385 }, { "epoch": 4.748078565328779, - "grad_norm": 0.435546875, - "learning_rate": 0.0001725730855175615, - "loss": 1.0294, + "grad_norm": 0.33984375, + "learning_rate": 0.0001255353295116187, + "loss": 0.9438, "step": 1390 }, { "epoch": 4.765157984628523, - "grad_norm": 0.439453125, - "learning_rate": 0.00017229836637478902, - "loss": 1.0283, + "grad_norm": 0.267578125, + "learning_rate": 0.00012495697688987112, + "loss": 0.942, "step": 1395 }, { "epoch": 4.782237403928266, - "grad_norm": 0.314453125, - "learning_rate": 0.00017202249925174723, - "loss": 1.0295, + "grad_norm": 0.326171875, + "learning_rate": 0.00012437773264829897, + "loss": 0.9436, "step": 1400 }, { "epoch": 4.79931682322801, - "grad_norm": 0.2734375, - "learning_rate": 0.0001717454885287566, - "loss": 1.0252, + "grad_norm": 0.2890625, + "learning_rate": 0.0001237976174811414, + "loss": 0.9403, "step": 1405 }, { "epoch": 4.816396242527754, - "grad_norm": 0.29296875, - "learning_rate": 0.00017146733860429612, - "loss": 1.0219, + "grad_norm": 0.2578125, + "learning_rate": 0.00012321665211375256, + "loss": 0.9361, "step": 1410 }, { "epoch": 4.833475661827498, - "grad_norm": 0.34765625, - "learning_rate": 0.0001711880538949334, - "loss": 1.0245, + "grad_norm": 0.234375, + "learning_rate": 0.00012263485730186103, + "loss": 0.9404, "step": 1415 }, { "epoch": 4.850555081127242, - "grad_norm": 0.314453125, - "learning_rate": 0.0001709076388352546, - "loss": 1.0266, + "grad_norm": 0.283203125, + "learning_rate": 0.00012205225383082843, + "loss": 0.9409, "step": 1420 }, { "epoch": 4.867634500426986, - "grad_norm": 0.392578125, - "learning_rate": 0.00017062609787779403, - "loss": 1.0193, + "grad_norm": 0.2421875, + "learning_rate": 0.0001214688625149066, + "loss": 0.9351, "step": 1425 }, { "epoch": 4.884713919726729, - "grad_norm": 0.53125, - "learning_rate": 0.00017034343549296346, - "loss": 1.024, + "grad_norm": 0.2890625, + "learning_rate": 0.00012088470419649432, + "loss": 0.938, "step": 1430 }, { "epoch": 4.901793339026473, - "grad_norm": 0.412109375, - "learning_rate": 0.00017005965616898096, - "loss": 1.0272, + "grad_norm": 0.236328125, + "learning_rate": 0.00012029979974539234, + "loss": 0.9425, "step": 1435 }, { "epoch": 4.918872758326217, - "grad_norm": 0.365234375, - "learning_rate": 0.00016977476441179992, - "loss": 1.0212, + "grad_norm": 0.240234375, + "learning_rate": 0.00011971417005805818, + "loss": 0.9372, "step": 1440 }, { "epoch": 4.9359521776259605, - "grad_norm": 0.30859375, - "learning_rate": 0.00016948876474503726, - "loss": 1.0268, + "grad_norm": 0.2451171875, + "learning_rate": 0.00011912783605685913, + "loss": 0.9399, "step": 1445 }, { "epoch": 4.953031596925705, - "grad_norm": 0.3125, - "learning_rate": 0.0001692016617099018, - "loss": 1.0238, + "grad_norm": 0.251953125, + "learning_rate": 0.0001185408186893251, + "loss": 0.9385, "step": 1450 }, { "epoch": 4.970111016225449, - "grad_norm": 0.310546875, - "learning_rate": 0.0001689134598651219, - "loss": 1.0161, + "grad_norm": 0.318359375, + "learning_rate": 0.0001179531389274001, + "loss": 0.9311, "step": 1455 }, { "epoch": 4.987190435525192, - "grad_norm": 0.408203125, - "learning_rate": 0.0001686241637868734, - "loss": 1.0188, + "grad_norm": 0.2470703125, + "learning_rate": 0.00011736481776669306, + "loss": 0.9342, "step": 1460 }, { "epoch": 4.997438087105039, - "eval_loss": 2.4524176120758057, - "eval_runtime": 0.5495, - "eval_samples_per_second": 18.198, - "eval_steps_per_second": 1.82, + "eval_loss": 2.5202550888061523, + "eval_runtime": 0.5609, + "eval_samples_per_second": 17.829, + "eval_steps_per_second": 1.783, "step": 1463 }, { "epoch": 5.004269854824936, - "grad_norm": 0.55859375, - "learning_rate": 0.0001683337780687066, - "loss": 1.0219, + "grad_norm": 0.263671875, + "learning_rate": 0.00011677587622572763, + "loss": 0.9354, "step": 1465 }, { "epoch": 5.02134927412468, - "grad_norm": 0.30078125, - "learning_rate": 0.0001680423073214737, - "loss": 1.0173, + "grad_norm": 0.28515625, + "learning_rate": 0.00011618633534519141, + "loss": 0.9194, "step": 1470 }, { "epoch": 5.038428693424423, - "grad_norm": 0.376953125, - "learning_rate": 0.00016774975617325527, - "loss": 1.0036, + "grad_norm": 0.3046875, + "learning_rate": 0.00011559621618718414, + "loss": 0.9073, "step": 1475 }, { "epoch": 5.0555081127241674, - "grad_norm": 0.4453125, - "learning_rate": 0.00016745612926928694, - "loss": 1.0119, + "grad_norm": 0.30078125, + "learning_rate": 0.00011500553983446527, + "loss": 0.9146, "step": 1480 }, { "epoch": 5.0725875320239115, - "grad_norm": 0.40234375, - "learning_rate": 0.00016716143127188548, - "loss": 1.0061, + "grad_norm": 0.24609375, + "learning_rate": 0.00011441432738970072, + "loss": 0.9098, "step": 1485 }, { "epoch": 5.089666951323655, - "grad_norm": 0.396484375, - "learning_rate": 0.0001668656668603751, - "loss": 1.0114, + "grad_norm": 0.291015625, + "learning_rate": 0.00011382259997470899, + "loss": 0.9135, "step": 1490 }, { "epoch": 5.106746370623399, - "grad_norm": 0.314453125, - "learning_rate": 0.00016656884073101266, - "loss": 1.0145, + "grad_norm": 0.294921875, + "learning_rate": 0.00011323037872970657, + "loss": 0.9174, "step": 1495 }, { "epoch": 5.123825789923143, - "grad_norm": 0.287109375, - "learning_rate": 0.00016627095759691362, - "loss": 1.0101, + "grad_norm": 0.240234375, + "learning_rate": 0.00011263768481255264, + "loss": 0.9155, "step": 1500 }, { "epoch": 5.140905209222886, - "grad_norm": 0.326171875, - "learning_rate": 0.00016597202218797676, - "loss": 1.0081, + "grad_norm": 0.25, + "learning_rate": 0.00011204453939799315, + "loss": 0.9115, "step": 1505 }, { "epoch": 5.15798462852263, - "grad_norm": 0.3359375, - "learning_rate": 0.0001656720392508094, - "loss": 1.0066, + "grad_norm": 0.3046875, + "learning_rate": 0.00011145096367690444, + "loss": 0.9112, "step": 1510 }, { "epoch": 5.175064047822374, - "grad_norm": 0.390625, - "learning_rate": 0.0001653710135486518, - "loss": 1.0109, + "grad_norm": 0.26953125, + "learning_rate": 0.0001108569788555361, + "loss": 0.9142, "step": 1515 }, { "epoch": 5.192143467122118, - "grad_norm": 0.314453125, - "learning_rate": 0.00016506894986130171, - "loss": 1.007, + "grad_norm": 0.251953125, + "learning_rate": 0.00011026260615475333, + "loss": 0.9116, "step": 1520 }, { "epoch": 5.209222886421862, - "grad_norm": 0.30859375, - "learning_rate": 0.00016476585298503835, - "loss": 1.0113, + "grad_norm": 0.2490234375, + "learning_rate": 0.00010966786680927874, + "loss": 0.9141, "step": 1525 }, { "epoch": 5.226302305721606, - "grad_norm": 0.43359375, - "learning_rate": 0.00016446172773254629, - "loss": 1.0123, + "grad_norm": 0.25, + "learning_rate": 0.00010907278206693395, + "loss": 0.9168, "step": 1530 }, { "epoch": 5.243381725021349, - "grad_norm": 0.328125, - "learning_rate": 0.0001641565789328391, - "loss": 1.0168, + "grad_norm": 0.30078125, + "learning_rate": 0.00010847737318788013, + "loss": 0.9216, "step": 1535 }, { "epoch": 5.260461144321093, - "grad_norm": 0.333984375, - "learning_rate": 0.00016385041143118255, - "loss": 1.0116, + "grad_norm": 0.283203125, + "learning_rate": 0.00010788166144385888, + "loss": 0.9167, "step": 1540 }, { "epoch": 5.277540563620837, - "grad_norm": 0.38671875, - "learning_rate": 0.00016354323008901776, - "loss": 1.0098, + "grad_norm": 0.25390625, + "learning_rate": 0.0001072856681174318, + "loss": 0.9155, "step": 1545 }, { "epoch": 5.2946199829205804, - "grad_norm": 0.388671875, - "learning_rate": 0.000163235039783884, - "loss": 1.0168, + "grad_norm": 0.2734375, + "learning_rate": 0.00010668941450122055, + "loss": 0.9218, "step": 1550 }, { "epoch": 5.3116994022203246, - "grad_norm": 0.37109375, - "learning_rate": 0.00016292584540934113, - "loss": 1.007, + "grad_norm": 0.259765625, + "learning_rate": 0.00010609292189714586, + "loss": 0.9132, "step": 1555 }, { "epoch": 5.328778821520069, - "grad_norm": 0.3984375, - "learning_rate": 0.0001626156518748922, - "loss": 1.0133, + "grad_norm": 0.25, + "learning_rate": 0.0001054962116156667, + "loss": 0.9181, "step": 1560 }, { "epoch": 5.345858240819812, - "grad_norm": 0.306640625, - "learning_rate": 0.00016230446410590504, - "loss": 1.0106, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001048993049750188, + "loss": 0.9151, "step": 1565 }, { "epoch": 5.362937660119556, - "grad_norm": 0.345703125, - "learning_rate": 0.00016199228704353455, - "loss": 1.0024, + "grad_norm": 0.2490234375, + "learning_rate": 0.00010430222330045304, + "loss": 0.9096, "step": 1570 }, { "epoch": 5.3800170794193, - "grad_norm": 0.40234375, - "learning_rate": 0.00016167912564464383, - "loss": 1.0121, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001037049879234737, + "loss": 0.9183, "step": 1575 }, { "epoch": 5.397096498719043, - "grad_norm": 0.404296875, - "learning_rate": 0.00016136498488172568, - "loss": 1.0089, + "grad_norm": 0.26171875, + "learning_rate": 0.0001031076201810762, + "loss": 0.9151, "step": 1580 }, { "epoch": 5.414175918018787, - "grad_norm": 0.345703125, - "learning_rate": 0.00016104986974282363, - "loss": 1.0157, + "grad_norm": 0.2890625, + "learning_rate": 0.00010251014141498484, + "loss": 0.9205, "step": 1585 }, { "epoch": 5.4312553373185315, - "grad_norm": 0.29296875, - "learning_rate": 0.0001607337852314527, - "loss": 1.0051, + "grad_norm": 0.294921875, + "learning_rate": 0.00010191257297089052, + "loss": 0.9114, "step": 1590 }, { "epoch": 5.448334756618275, - "grad_norm": 0.390625, - "learning_rate": 0.00016041673636651996, - "loss": 1.0094, + "grad_norm": 0.2392578125, + "learning_rate": 0.00010131493619768788, + "loss": 0.9148, "step": 1595 }, { "epoch": 5.465414175918019, - "grad_norm": 0.3046875, - "learning_rate": 0.00016009872818224485, - "loss": 1.0125, + "grad_norm": 0.244140625, + "learning_rate": 0.00010071725244671282, + "loss": 0.9202, "step": 1600 }, { "epoch": 5.482493595217763, - "grad_norm": 0.28515625, - "learning_rate": 0.0001597797657280792, - "loss": 1.014, + "grad_norm": 0.259765625, + "learning_rate": 0.00010011954307097942, + "loss": 0.9216, "step": 1605 }, { "epoch": 5.499573014517506, - "grad_norm": 0.302734375, - "learning_rate": 0.00015945985406862721, - "loss": 1.0154, + "grad_norm": 0.251953125, + "learning_rate": 9.952182942441733e-05, + "loss": 0.9206, "step": 1610 }, { "epoch": 5.51665243381725, - "grad_norm": 0.365234375, - "learning_rate": 0.00015913899828356477, - "loss": 1.0122, + "grad_norm": 0.265625, + "learning_rate": 9.892413286110886e-05, + "loss": 0.9193, "step": 1615 }, { "epoch": 5.533731853116994, - "grad_norm": 0.3359375, - "learning_rate": 0.00015881720346755905, - "loss": 1.0133, + "grad_norm": 0.2431640625, + "learning_rate": 9.83264747345259e-05, + "loss": 0.9195, "step": 1620 }, { "epoch": 5.5508112724167376, - "grad_norm": 0.330078125, - "learning_rate": 0.0001584944747301874, - "loss": 1.0087, + "grad_norm": 0.28515625, + "learning_rate": 9.772887639676707e-05, + "loss": 0.9178, "step": 1625 }, { "epoch": 5.567890691716482, - "grad_norm": 0.3203125, - "learning_rate": 0.00015817081719585643, - "loss": 1.0101, + "grad_norm": 0.2373046875, + "learning_rate": 9.713135919779515e-05, + "loss": 0.9174, "step": 1630 }, { "epoch": 5.584970111016226, - "grad_norm": 0.3046875, - "learning_rate": 0.00015784623600372042, - "loss": 1.0118, + "grad_norm": 0.2890625, + "learning_rate": 9.653394448467399e-05, + "loss": 0.9194, "step": 1635 }, { "epoch": 5.602049530315969, - "grad_norm": 0.48828125, - "learning_rate": 0.00015752073630759998, - "loss": 1.0133, + "grad_norm": 0.2578125, + "learning_rate": 9.593665360080599e-05, + "loss": 0.9192, "step": 1640 }, { "epoch": 5.619128949615713, - "grad_norm": 0.37109375, - "learning_rate": 0.00015719432327589988, - "loss": 1.0089, + "grad_norm": 0.28515625, + "learning_rate": 9.533950788516974e-05, + "loss": 0.9154, "step": 1645 }, { "epoch": 5.636208368915457, - "grad_norm": 0.6171875, - "learning_rate": 0.00015686700209152738, - "loss": 1.007, - "step": 1650 + "grad_norm": 0.255859375, + "learning_rate": 9.474252867155732e-05, + "loss": 0.9142, + "step": 1650 }, { "epoch": 5.6532877882152, - "grad_norm": 0.306640625, - "learning_rate": 0.00015653877795180954, - "loss": 1.0031, + "grad_norm": 0.267578125, + "learning_rate": 9.414573728781247e-05, + "loss": 0.9101, "step": 1655 }, { "epoch": 5.6703672075149445, - "grad_norm": 0.40234375, - "learning_rate": 0.00015620965606841098, - "loss": 1.0099, + "grad_norm": 0.28125, + "learning_rate": 9.354915505506839e-05, + "loss": 0.9158, "step": 1660 }, { "epoch": 5.687446626814689, - "grad_norm": 0.41796875, - "learning_rate": 0.00015587964166725095, - "loss": 1.0127, + "grad_norm": 0.2578125, + "learning_rate": 9.295280328698604e-05, + "loss": 0.9181, "step": 1665 }, { "epoch": 5.704526046114432, - "grad_norm": 0.423828125, - "learning_rate": 0.0001555487399884206, - "loss": 1.0049, + "grad_norm": 0.306640625, + "learning_rate": 9.235670328899293e-05, + "loss": 0.9138, "step": 1670 }, { "epoch": 5.721605465414176, - "grad_norm": 0.345703125, - "learning_rate": 0.00015521695628609937, - "loss": 1.0036, + "grad_norm": 0.26171875, + "learning_rate": 9.176087635752156e-05, + "loss": 0.9119, "step": 1675 }, { "epoch": 5.73868488471392, - "grad_norm": 0.330078125, - "learning_rate": 0.00015488429582847192, - "loss": 1.012, + "grad_norm": 0.26171875, + "learning_rate": 9.116534377924883e-05, + "loss": 0.9213, "step": 1680 }, { "epoch": 5.755764304013663, - "grad_norm": 0.458984375, - "learning_rate": 0.00015455076389764443, - "loss": 1.0099, + "grad_norm": 0.251953125, + "learning_rate": 9.057012683033555e-05, + "loss": 0.9177, "step": 1685 }, { "epoch": 5.772843723313407, - "grad_norm": 0.380859375, - "learning_rate": 0.0001542163657895605, - "loss": 1.0144, + "grad_norm": 0.298828125, + "learning_rate": 8.997524677566627e-05, + "loss": 0.9217, "step": 1690 }, { "epoch": 5.789923142613151, - "grad_norm": 0.30859375, - "learning_rate": 0.00015388110681391725, - "loss": 1.0082, + "grad_norm": 0.236328125, + "learning_rate": 8.938072486808952e-05, + "loss": 0.9167, "step": 1695 }, { "epoch": 5.807002561912895, - "grad_norm": 0.328125, - "learning_rate": 0.00015354499229408114, - "loss": 1.013, + "grad_norm": 0.2392578125, + "learning_rate": 8.878658234765858e-05, + "loss": 0.9207, "step": 1700 }, { "epoch": 5.824081981212639, - "grad_norm": 0.3046875, - "learning_rate": 0.00015320802756700302, - "loss": 1.0089, + "grad_norm": 0.365234375, + "learning_rate": 8.81928404408726e-05, + "loss": 0.9173, "step": 1705 }, { "epoch": 5.841161400512383, - "grad_norm": 0.31640625, - "learning_rate": 0.0001528702179831338, - "loss": 1.0117, + "grad_norm": 0.298828125, + "learning_rate": 8.759952035991844e-05, + "loss": 0.9192, "step": 1710 }, { "epoch": 5.858240819812126, - "grad_norm": 0.33203125, - "learning_rate": 0.00015253156890633935, - "loss": 1.0087, + "grad_norm": 0.26171875, + "learning_rate": 8.70066433019125e-05, + "loss": 0.9178, "step": 1715 }, { "epoch": 5.87532023911187, - "grad_norm": 0.330078125, - "learning_rate": 0.00015219208571381525, - "loss": 1.0161, + "grad_norm": 0.2373046875, + "learning_rate": 8.641423044814374e-05, + "loss": 0.9246, "step": 1720 }, { "epoch": 5.892399658411614, - "grad_norm": 0.294921875, - "learning_rate": 0.00015185177379600152, - "loss": 1.0109, + "grad_norm": 0.255859375, + "learning_rate": 8.582230296331686e-05, + "loss": 0.9187, "step": 1725 }, { "epoch": 5.9094790777113575, - "grad_norm": 0.306640625, - "learning_rate": 0.00015151063855649698, - "loss": 1.0131, + "grad_norm": 0.240234375, + "learning_rate": 8.5230881994796e-05, + "loss": 0.9211, "step": 1730 }, { "epoch": 5.926558497011102, - "grad_norm": 0.384765625, - "learning_rate": 0.00015116868541197343, - "loss": 1.0118, + "grad_norm": 0.265625, + "learning_rate": 8.463998867184952e-05, + "loss": 0.9194, "step": 1735 }, { "epoch": 5.943637916310846, - "grad_norm": 0.322265625, - "learning_rate": 0.00015082591979208976, - "loss": 1.0126, + "grad_norm": 0.2490234375, + "learning_rate": 8.404964410489485e-05, + "loss": 0.9215, "step": 1740 }, { "epoch": 5.960717335610589, - "grad_norm": 0.365234375, - "learning_rate": 0.0001504823471394055, - "loss": 1.0065, + "grad_norm": 0.2412109375, + "learning_rate": 8.34598693847444e-05, + "loss": 0.9144, "step": 1745 }, { "epoch": 5.977796754910333, - "grad_norm": 0.353515625, - "learning_rate": 0.00015013797290929466, - "loss": 1.0095, + "grad_norm": 0.287109375, + "learning_rate": 8.287068558185225e-05, + "loss": 0.9175, "step": 1750 }, { "epoch": 5.994876174210077, - "grad_norm": 0.4609375, - "learning_rate": 0.000149792802569859, - "loss": 1.0119, + "grad_norm": 0.2314453125, + "learning_rate": 8.228211374556103e-05, + "loss": 0.9201, "step": 1755 }, { "epoch": 5.998292058070025, - "eval_loss": 2.454439640045166, - "eval_runtime": 0.5526, - "eval_samples_per_second": 18.096, - "eval_steps_per_second": 1.81, + "eval_loss": 2.5518593788146973, + "eval_runtime": 0.5597, + "eval_samples_per_second": 17.868, + "eval_steps_per_second": 1.787, "step": 1756 }, { "epoch": 6.01195559350982, - "grad_norm": 0.4765625, - "learning_rate": 0.00014944684160184108, - "loss": 1.001, + "grad_norm": 0.2490234375, + "learning_rate": 8.169417490335007e-05, + "loss": 0.9044, "step": 1760 }, { "epoch": 6.029035012809564, - "grad_norm": 0.423828125, - "learning_rate": 0.00014910009549853746, - "loss": 0.9916, + "grad_norm": 0.265625, + "learning_rate": 8.110689006008434e-05, + "loss": 0.8914, "step": 1765 }, { "epoch": 6.0461144321093085, - "grad_norm": 0.37109375, - "learning_rate": 0.00014875256976571135, - "loss": 0.9965, + "grad_norm": 0.263671875, + "learning_rate": 8.052028019726371e-05, + "loss": 0.896, "step": 1770 }, { "epoch": 6.063193851409052, - "grad_norm": 0.458984375, - "learning_rate": 0.0001484042699215052, - "loss": 1.009, + "grad_norm": 0.26953125, + "learning_rate": 7.993436627227368e-05, + "loss": 0.9072, "step": 1775 }, { "epoch": 6.080273270708796, - "grad_norm": 0.34765625, - "learning_rate": 0.00014805520149635307, - "loss": 1.001, + "grad_norm": 0.248046875, + "learning_rate": 7.934916921763628e-05, + "loss": 0.8999, "step": 1780 }, { "epoch": 6.09735269000854, - "grad_norm": 0.359375, - "learning_rate": 0.0001477053700328929, - "loss": 0.9998, + "grad_norm": 0.25, + "learning_rate": 7.876470994026254e-05, + "loss": 0.8991, "step": 1785 }, { "epoch": 6.114432109308283, - "grad_norm": 0.310546875, - "learning_rate": 0.00014735478108587828, - "loss": 1.008, + "grad_norm": 0.259765625, + "learning_rate": 7.818100932070546e-05, + "loss": 0.9065, "step": 1790 }, { "epoch": 6.131511528608027, - "grad_norm": 0.322265625, - "learning_rate": 0.0001470034402220906, - "loss": 0.9872, + "grad_norm": 0.251953125, + "learning_rate": 7.759808821241406e-05, + "loss": 0.8899, "step": 1795 }, { "epoch": 6.148590947907771, - "grad_norm": 0.33984375, - "learning_rate": 0.00014665135302025035, - "loss": 0.9968, + "grad_norm": 0.24609375, + "learning_rate": 7.701596744098818e-05, + "loss": 0.8956, "step": 1800 }, { "epoch": 6.165670367207515, - "grad_norm": 0.34375, - "learning_rate": 0.00014629852507092866, - "loss": 0.996, + "grad_norm": 0.26171875, + "learning_rate": 7.643466780343479e-05, + "loss": 0.8964, "step": 1805 }, { "epoch": 6.182749786507259, - "grad_norm": 0.333984375, - "learning_rate": 0.00014594496197645852, - "loss": 0.9986, + "grad_norm": 0.2470703125, + "learning_rate": 7.585421006742463e-05, + "loss": 0.8985, "step": 1810 }, { "epoch": 6.199829205807003, - "grad_norm": 0.30859375, - "learning_rate": 0.00014559066935084588, - "loss": 0.9966, + "grad_norm": 0.251953125, + "learning_rate": 7.527461497055061e-05, + "loss": 0.8979, "step": 1815 }, { "epoch": 6.216908625106746, - "grad_norm": 0.341796875, - "learning_rate": 0.0001452356528196804, - "loss": 1.002, + "grad_norm": 0.26953125, + "learning_rate": 7.469590321958662e-05, + "loss": 0.9014, "step": 1820 }, { "epoch": 6.23398804440649, - "grad_norm": 0.33984375, - "learning_rate": 0.00014487991802004623, - "loss": 1.0061, + "grad_norm": 0.25, + "learning_rate": 7.411809548974792e-05, + "loss": 0.9059, "step": 1825 }, { "epoch": 6.251067463706234, - "grad_norm": 0.29296875, - "learning_rate": 0.00014452347060043237, - "loss": 1.0019, + "grad_norm": 0.23828125, + "learning_rate": 7.354121242395254e-05, + "loss": 0.903, "step": 1830 }, { "epoch": 6.268146883005977, - "grad_norm": 0.404296875, - "learning_rate": 0.00014416631622064316, - "loss": 0.9955, + "grad_norm": 0.267578125, + "learning_rate": 7.296527463208358e-05, + "loss": 0.8955, "step": 1835 }, { "epoch": 6.2852263023057215, - "grad_norm": 0.94921875, - "learning_rate": 0.00014380846055170828, - "loss": 0.9979, + "grad_norm": 0.2578125, + "learning_rate": 7.239030269025311e-05, + "loss": 0.8991, "step": 1840 }, { "epoch": 6.302305721605466, - "grad_norm": 0.328125, - "learning_rate": 0.00014344990927579268, - "loss": 1.0024, + "grad_norm": 0.255859375, + "learning_rate": 7.1816317140067e-05, + "loss": 0.9014, "step": 1845 }, { "epoch": 6.319385140905209, - "grad_norm": 0.314453125, - "learning_rate": 0.00014309066808610655, - "loss": 1.0009, + "grad_norm": 0.244140625, + "learning_rate": 7.124333848789091e-05, + "loss": 0.9015, "step": 1850 }, { "epoch": 6.336464560204953, - "grad_norm": 0.486328125, - "learning_rate": 0.00014273074268681462, - "loss": 1.0039, + "grad_norm": 0.251953125, + "learning_rate": 7.067138720411795e-05, + "loss": 0.9039, "step": 1855 }, { "epoch": 6.353543979504697, - "grad_norm": 0.310546875, - "learning_rate": 0.0001423701387929459, - "loss": 0.9994, + "grad_norm": 0.263671875, + "learning_rate": 7.010048372243698e-05, + "loss": 0.8993, "step": 1860 }, { "epoch": 6.37062339880444, - "grad_norm": 0.283203125, - "learning_rate": 0.0001420088621303027, - "loss": 1.0076, + "grad_norm": 0.2734375, + "learning_rate": 6.953064843910296e-05, + "loss": 0.908, "step": 1865 }, { "epoch": 6.387702818104184, - "grad_norm": 0.373046875, - "learning_rate": 0.00014164691843536982, - "loss": 1.0011, + "grad_norm": 0.26953125, + "learning_rate": 6.8961901712208e-05, + "loss": 0.9021, "step": 1870 }, { "epoch": 6.4047822374039285, - "grad_norm": 0.345703125, - "learning_rate": 0.0001412843134552235, - "loss": 0.9996, + "grad_norm": 0.240234375, + "learning_rate": 6.839426386095425e-05, + "loss": 0.9002, "step": 1875 }, { "epoch": 6.421861656703672, - "grad_norm": 0.337890625, - "learning_rate": 0.00014092105294744, - "loss": 0.9984, + "grad_norm": 0.279296875, + "learning_rate": 6.782775516492771e-05, + "loss": 0.9007, "step": 1880 }, { "epoch": 6.438941076003416, - "grad_norm": 0.337890625, - "learning_rate": 0.00014055714268000445, - "loss": 0.9947, + "grad_norm": 0.265625, + "learning_rate": 6.726239586337408e-05, + "loss": 0.8959, "step": 1885 }, { "epoch": 6.45602049530316, - "grad_norm": 0.35546875, - "learning_rate": 0.00014019258843121893, - "loss": 1.0061, + "grad_norm": 0.2470703125, + "learning_rate": 6.669820615447522e-05, + "loss": 0.9078, "step": 1890 }, { "epoch": 6.473099914602903, - "grad_norm": 0.330078125, - "learning_rate": 0.000139827395989611, - "loss": 0.9986, + "grad_norm": 0.2490234375, + "learning_rate": 6.613520619462803e-05, + "loss": 0.8996, "step": 1895 }, { "epoch": 6.490179333902647, - "grad_norm": 0.357421875, - "learning_rate": 0.0001394615711538417, - "loss": 1.0003, + "grad_norm": 0.251953125, + "learning_rate": 6.5573416097724e-05, + "loss": 0.9023, "step": 1900 }, { "epoch": 6.507258753202391, - "grad_norm": 0.353515625, - "learning_rate": 0.0001390951197326134, - "loss": 0.9985, + "grad_norm": 0.259765625, + "learning_rate": 6.50128559344307e-05, + "loss": 0.9004, "step": 1905 }, { "epoch": 6.5243381725021345, - "grad_norm": 0.3359375, - "learning_rate": 0.00013872804754457759, - "loss": 1.0071, + "grad_norm": 0.244140625, + "learning_rate": 6.445354573147484e-05, + "loss": 0.9088, "step": 1910 }, { "epoch": 6.541417591801879, - "grad_norm": 0.32421875, - "learning_rate": 0.00013836036041824264, - "loss": 0.9921, + "grad_norm": 0.2412109375, + "learning_rate": 6.389550547092661e-05, + "loss": 0.8937, "step": 1915 }, { "epoch": 6.558497011101623, - "grad_norm": 0.306640625, - "learning_rate": 0.00013799206419188103, - "loss": 1.0062, + "grad_norm": 0.26171875, + "learning_rate": 6.333875508948593e-05, + "loss": 0.906, "step": 1920 }, { "epoch": 6.575576430401366, - "grad_norm": 0.322265625, - "learning_rate": 0.0001376231647134369, - "loss": 1.0059, + "grad_norm": 0.2431640625, + "learning_rate": 6.278331447777021e-05, + "loss": 0.9062, "step": 1925 }, { "epoch": 6.59265584970111, - "grad_norm": 0.392578125, - "learning_rate": 0.00013725366784043288, - "loss": 1.0001, + "grad_norm": 0.251953125, + "learning_rate": 6.22292034796035e-05, + "loss": 0.9004, "step": 1930 }, { "epoch": 6.609735269000854, - "grad_norm": 0.3515625, - "learning_rate": 0.00013688357943987732, - "loss": 0.9975, + "grad_norm": 0.2490234375, + "learning_rate": 6.167644189130794e-05, + "loss": 0.8995, "step": 1935 }, { "epoch": 6.626814688300598, - "grad_norm": 0.490234375, - "learning_rate": 0.00013651290538817113, - "loss": 1.0012, + "grad_norm": 0.2412109375, + "learning_rate": 6.112504946099604e-05, + "loss": 0.9011, "step": 1940 }, { "epoch": 6.6438941076003415, - "grad_norm": 0.330078125, - "learning_rate": 0.00013614165157101423, - "loss": 0.9949, + "grad_norm": 0.248046875, + "learning_rate": 6.057504588786556e-05, + "loss": 0.8957, "step": 1945 }, { "epoch": 6.660973526900086, - "grad_norm": 0.318359375, - "learning_rate": 0.0001357698238833126, - "loss": 1.0088, + "grad_norm": 0.2578125, + "learning_rate": 6.0026450821495536e-05, + "loss": 0.909, "step": 1950 }, { "epoch": 6.678052946199829, - "grad_norm": 0.267578125, - "learning_rate": 0.0001353974282290839, - "loss": 0.9965, + "grad_norm": 0.2373046875, + "learning_rate": 5.947928386114428e-05, + "loss": 0.8996, "step": 1955 }, { "epoch": 6.695132365499573, - "grad_norm": 0.31640625, - "learning_rate": 0.00013502447052136455, - "loss": 1.0063, + "grad_norm": 0.34375, + "learning_rate": 5.8933564555049105e-05, + "loss": 0.9072, "step": 1960 }, { "epoch": 6.712211784799317, - "grad_norm": 0.3515625, - "learning_rate": 0.0001346509566821153, - "loss": 0.999, + "grad_norm": 0.2578125, + "learning_rate": 5.838931239972824e-05, + "loss": 0.9022, "step": 1965 }, { "epoch": 6.729291204099061, - "grad_norm": 0.3671875, - "learning_rate": 0.00013427689264212738, - "loss": 0.9998, + "grad_norm": 0.251953125, + "learning_rate": 5.784654683928391e-05, + "loss": 0.9009, "step": 1970 }, { "epoch": 6.746370623398804, - "grad_norm": 0.57421875, - "learning_rate": 0.00013390228434092833, - "loss": 0.9977, + "grad_norm": 0.244140625, + "learning_rate": 5.730528726470792e-05, + "loss": 0.8999, "step": 1975 }, { "epoch": 6.763450042698548, - "grad_norm": 0.337890625, - "learning_rate": 0.00013352713772668765, - "loss": 0.9991, + "grad_norm": 0.2490234375, + "learning_rate": 5.6765553013188766e-05, + "loss": 0.9002, "step": 1980 }, { "epoch": 6.780529461998292, - "grad_norm": 0.326171875, - "learning_rate": 0.00013315145875612236, - "loss": 0.9939, + "grad_norm": 0.26171875, + "learning_rate": 5.622736336742087e-05, + "loss": 0.8965, "step": 1985 }, { "epoch": 6.797608881298036, - "grad_norm": 0.388671875, - "learning_rate": 0.0001327752533944025, - "loss": 0.9993, + "grad_norm": 0.314453125, + "learning_rate": 5.5690737554915604e-05, + "loss": 0.9015, "step": 1990 }, { "epoch": 6.81468830059778, - "grad_norm": 0.400390625, - "learning_rate": 0.00013239852761505626, - "loss": 1.0079, + "grad_norm": 0.244140625, + "learning_rate": 5.5155694747314504e-05, + "loss": 0.9105, "step": 1995 }, { "epoch": 6.831767719897524, - "grad_norm": 0.396484375, - "learning_rate": 0.00013202128739987532, - "loss": 0.9962, + "grad_norm": 0.244140625, + "learning_rate": 5.462225405970401e-05, + "loss": 0.8978, "step": 2000 }, { "epoch": 6.848847139197267, - "grad_norm": 0.35546875, - "learning_rate": 0.00013164353873881961, - "loss": 0.9982, + "grad_norm": 0.263671875, + "learning_rate": 5.4090434549933064e-05, + "loss": 0.8999, "step": 2005 }, { "epoch": 6.865926558497011, - "grad_norm": 0.353515625, - "learning_rate": 0.00013126528762992247, - "loss": 0.9959, + "grad_norm": 0.2734375, + "learning_rate": 5.3560255217931785e-05, + "loss": 0.8988, "step": 2010 }, { "epoch": 6.8830059777967545, - "grad_norm": 0.369140625, - "learning_rate": 0.0001308865400791953, - "loss": 1.0043, + "grad_norm": 0.2412109375, + "learning_rate": 5.303173500503289e-05, + "loss": 0.9055, "step": 2015 }, { "epoch": 6.900085397096499, - "grad_norm": 0.310546875, - "learning_rate": 0.0001305073021005321, - "loss": 0.9966, + "grad_norm": 0.248046875, + "learning_rate": 5.2504892793295e-05, + "loss": 0.8991, "step": 2020 }, { "epoch": 6.917164816396243, - "grad_norm": 0.453125, - "learning_rate": 0.00013012757971561415, - "loss": 0.9953, + "grad_norm": 0.236328125, + "learning_rate": 5.197974740482785e-05, + "loss": 0.8997, "step": 2025 }, { "epoch": 6.934244235695987, - "grad_norm": 0.435546875, - "learning_rate": 0.0001297473789538142, - "loss": 0.9957, + "grad_norm": 0.2392578125, + "learning_rate": 5.145631760112022e-05, + "loss": 0.8983, "step": 2030 }, { "epoch": 6.95132365499573, - "grad_norm": 0.453125, - "learning_rate": 0.00012936670585210103, - "loss": 1.0004, + "grad_norm": 0.232421875, + "learning_rate": 5.093462208236931e-05, + "loss": 0.9038, "step": 2035 }, { "epoch": 6.968403074295474, - "grad_norm": 0.349609375, - "learning_rate": 0.00012898556645494325, - "loss": 0.9952, + "grad_norm": 0.2451171875, + "learning_rate": 5.041467948681269e-05, + "loss": 0.8978, "step": 2040 }, { "epoch": 6.985482493595217, - "grad_norm": 0.30859375, - "learning_rate": 0.00012860396681421354, - "loss": 1.0028, + "grad_norm": 0.240234375, + "learning_rate": 4.989650839006279e-05, + "loss": 0.9054, "step": 2045 }, { "epoch": 6.999146029035013, - "eval_loss": 2.4654781818389893, - "eval_runtime": 0.5548, - "eval_samples_per_second": 18.024, - "eval_steps_per_second": 1.802, + "eval_loss": 2.5763192176818848, + "eval_runtime": 0.5559, + "eval_samples_per_second": 17.989, + "eval_steps_per_second": 1.799, "step": 2049 }, { "epoch": 7.002561912894961, - "grad_norm": 0.34765625, - "learning_rate": 0.0001282219129890925, - "loss": 0.9919, + "grad_norm": 0.2431640625, + "learning_rate": 4.9380127304442634e-05, + "loss": 0.8953, "step": 2050 }, { "epoch": 7.0196413321947055, - "grad_norm": 0.2890625, - "learning_rate": 0.0001278394110459724, - "loss": 0.9953, + "grad_norm": 0.24609375, + "learning_rate": 4.886555467832512e-05, + "loss": 0.893, "step": 2055 }, { "epoch": 7.036720751494449, - "grad_norm": 0.3359375, - "learning_rate": 0.00012745646705836097, - "loss": 0.9879, + "grad_norm": 0.2451171875, + "learning_rate": 4.835280889547351e-05, + "loss": 0.8885, "step": 2060 }, { "epoch": 7.053800170794193, - "grad_norm": 0.37109375, - "learning_rate": 0.00012707308710678477, - "loss": 0.9939, + "grad_norm": 0.251953125, + "learning_rate": 4.7841908274384616e-05, + "loss": 0.8916, "step": 2065 }, { "epoch": 7.070879590093937, - "grad_norm": 0.37890625, - "learning_rate": 0.0001266892772786929, - "loss": 0.9936, + "grad_norm": 0.2421875, + "learning_rate": 4.733287106763481e-05, + "loss": 0.8906, "step": 2070 }, { "epoch": 7.08795900939368, - "grad_norm": 0.373046875, - "learning_rate": 0.00012630504366836008, - "loss": 0.987, + "grad_norm": 0.2451171875, + "learning_rate": 4.6825715461227284e-05, + "loss": 0.8876, "step": 2075 }, { "epoch": 7.105038428693424, - "grad_norm": 0.3046875, - "learning_rate": 0.0001259203923767901, - "loss": 0.9919, + "grad_norm": 0.2470703125, + "learning_rate": 4.6320459573942856e-05, + "loss": 0.8908, "step": 2080 }, { "epoch": 7.122117847993168, - "grad_norm": 0.318359375, - "learning_rate": 0.0001255353295116187, - "loss": 0.9909, + "grad_norm": 0.240234375, + "learning_rate": 4.581712145669239e-05, + "loss": 0.8887, "step": 2085 }, { "epoch": 7.1391972672929125, - "grad_norm": 0.31640625, - "learning_rate": 0.00012514986118701695, - "loss": 0.9868, + "grad_norm": 0.2451171875, + "learning_rate": 4.531571909187197e-05, + "loss": 0.886, "step": 2090 }, { "epoch": 7.156276686592656, - "grad_norm": 0.27734375, - "learning_rate": 0.00012476399352359376, - "loss": 0.9881, + "grad_norm": 0.2431640625, + "learning_rate": 4.481627039272056e-05, + "loss": 0.8883, "step": 2095 }, { "epoch": 7.1733561058924, - "grad_norm": 0.3515625, - "learning_rate": 0.00012437773264829897, - "loss": 0.9939, + "grad_norm": 0.25390625, + "learning_rate": 4.431879320267972e-05, + "loss": 0.8922, "step": 2100 }, { "epoch": 7.190435525192143, - "grad_norm": 0.361328125, - "learning_rate": 0.00012399108469432601, - "loss": 0.9834, + "grad_norm": 0.244140625, + "learning_rate": 4.38233052947565e-05, + "loss": 0.8825, "step": 2105 }, { "epoch": 7.207514944491887, - "grad_norm": 0.279296875, - "learning_rate": 0.00012360405580101448, - "loss": 0.9847, + "grad_norm": 0.2353515625, + "learning_rate": 4.332982437088825e-05, + "loss": 0.8856, "step": 2110 }, { "epoch": 7.224594363791631, - "grad_norm": 0.310546875, - "learning_rate": 0.00012321665211375256, - "loss": 0.9945, + "grad_norm": 0.248046875, + "learning_rate": 4.2838368061310276e-05, + "loss": 0.8929, "step": 2115 }, { "epoch": 7.241673783091375, - "grad_norm": 0.306640625, - "learning_rate": 0.00012282887978387976, - "loss": 0.999, + "grad_norm": 0.2431640625, + "learning_rate": 4.2348953923925916e-05, + "loss": 0.8977, "step": 2120 }, { "epoch": 7.2587532023911185, - "grad_norm": 0.310546875, - "learning_rate": 0.00012244074496858888, - "loss": 0.9854, + "grad_norm": 0.2392578125, + "learning_rate": 4.186159944367936e-05, + "loss": 0.8855, "step": 2125 }, { "epoch": 7.275832621690863, - "grad_norm": 0.3125, - "learning_rate": 0.00012205225383082843, - "loss": 0.9842, + "grad_norm": 0.2421875, + "learning_rate": 4.137632203193086e-05, + "loss": 0.8837, "step": 2130 }, { "epoch": 7.292912040990606, - "grad_norm": 0.353515625, - "learning_rate": 0.00012166341253920472, - "loss": 0.9928, + "grad_norm": 0.251953125, + "learning_rate": 4.0893139025834806e-05, + "loss": 0.8927, "step": 2135 }, { "epoch": 7.30999146029035, - "grad_norm": 0.318359375, - "learning_rate": 0.000121274227267884, - "loss": 0.9915, + "grad_norm": 0.25, + "learning_rate": 4.041206768772022e-05, + "loss": 0.8902, "step": 2140 }, { "epoch": 7.327070879590094, - "grad_norm": 0.328125, - "learning_rate": 0.00012088470419649432, - "loss": 0.992, + "grad_norm": 0.2451171875, + "learning_rate": 3.993312520447414e-05, + "loss": 0.8904, "step": 2145 }, { "epoch": 7.344150298889838, - "grad_norm": 0.56640625, - "learning_rate": 0.00012049484951002739, - "loss": 0.9897, + "grad_norm": 0.2451171875, + "learning_rate": 3.9456328686927525e-05, + "loss": 0.8885, "step": 2150 }, { "epoch": 7.361229718189581, - "grad_norm": 0.294921875, - "learning_rate": 0.00012010466939874053, - "loss": 0.9968, + "grad_norm": 0.2451171875, + "learning_rate": 3.898169516924398e-05, + "loss": 0.8945, "step": 2155 }, { "epoch": 7.3783091374893255, - "grad_norm": 0.294921875, - "learning_rate": 0.00011971417005805818, - "loss": 0.9933, + "grad_norm": 0.240234375, + "learning_rate": 3.850924160831115e-05, + "loss": 0.892, "step": 2160 }, { "epoch": 7.395388556789069, - "grad_norm": 0.279296875, - "learning_rate": 0.00011932335768847371, - "loss": 0.9953, + "grad_norm": 0.2451171875, + "learning_rate": 3.803898488313501e-05, + "loss": 0.8933, "step": 2165 }, { "epoch": 7.412467976088813, - "grad_norm": 0.330078125, - "learning_rate": 0.00011893223849545084, - "loss": 0.9918, + "grad_norm": 0.244140625, + "learning_rate": 3.757094179423672e-05, + "loss": 0.892, "step": 2170 }, { "epoch": 7.429547395388557, - "grad_norm": 0.40234375, - "learning_rate": 0.0001185408186893251, - "loss": 0.9896, + "grad_norm": 0.2392578125, + "learning_rate": 3.710512906305248e-05, + "loss": 0.8905, "step": 2175 }, { "epoch": 7.446626814688301, - "grad_norm": 0.35546875, - "learning_rate": 0.00011814910448520536, - "loss": 0.9906, + "grad_norm": 0.2373046875, + "learning_rate": 3.6641563331336125e-05, + "loss": 0.888, "step": 2180 }, { "epoch": 7.463706233988044, - "grad_norm": 0.376953125, - "learning_rate": 0.00011775710210287492, - "loss": 0.9844, + "grad_norm": 0.244140625, + "learning_rate": 3.618026116056456e-05, + "loss": 0.8847, "step": 2185 }, { "epoch": 7.480785653287788, - "grad_norm": 0.31640625, - "learning_rate": 0.00011736481776669306, - "loss": 0.99, + "grad_norm": 0.26171875, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.8867, "step": 2190 }, { "epoch": 7.497865072587532, - "grad_norm": 0.361328125, - "learning_rate": 0.00011697225770549585, - "loss": 0.9899, + "grad_norm": 0.251953125, + "learning_rate": 3.5264513342831615e-05, + "loss": 0.8894, "step": 2195 }, { "epoch": 7.514944491887276, - "grad_norm": 0.296875, - "learning_rate": 0.00011657942815249754, - "loss": 0.9888, + "grad_norm": 0.2431640625, + "learning_rate": 3.4810100412128747e-05, + "loss": 0.8894, "step": 2200 }, { "epoch": 7.53202391118702, - "grad_norm": 0.37109375, - "learning_rate": 0.00011618633534519141, - "loss": 0.9935, + "grad_norm": 0.248046875, + "learning_rate": 3.435801647371897e-05, + "loss": 0.8922, "step": 2205 }, { "epoch": 7.549103330486764, - "grad_norm": 0.33203125, - "learning_rate": 0.00011579298552525084, - "loss": 0.9955, + "grad_norm": 0.25, + "learning_rate": 3.3908277678877445e-05, + "loss": 0.8934, "step": 2210 }, { "epoch": 7.566182749786507, - "grad_norm": 0.310546875, - "learning_rate": 0.0001153993849384301, - "loss": 0.9884, + "grad_norm": 0.240234375, + "learning_rate": 3.346090009509613e-05, + "loss": 0.8865, "step": 2215 }, { "epoch": 7.583262169086251, - "grad_norm": 0.3046875, - "learning_rate": 0.00011500553983446527, - "loss": 0.9895, + "grad_norm": 0.244140625, + "learning_rate": 3.3015899705509734e-05, + "loss": 0.8889, "step": 2220 }, { "epoch": 7.600341588385994, - "grad_norm": 0.306640625, - "learning_rate": 0.00011461145646697495, - "loss": 0.9874, + "grad_norm": 0.248046875, + "learning_rate": 3.257329240832454e-05, + "loss": 0.886, "step": 2225 }, { "epoch": 7.6174210076857385, - "grad_norm": 0.28125, - "learning_rate": 0.00011421714109336097, - "loss": 0.9885, + "grad_norm": 0.244140625, + "learning_rate": 3.21330940162508e-05, + "loss": 0.8875, "step": 2230 }, { "epoch": 7.634500426985483, - "grad_norm": 0.279296875, - "learning_rate": 0.00011382259997470899, - "loss": 0.9876, + "grad_norm": 0.2392578125, + "learning_rate": 3.169532025593729e-05, + "loss": 0.8863, "step": 2235 }, { "epoch": 7.651579846285227, - "grad_norm": 0.31640625, - "learning_rate": 0.00011342783937568926, - "loss": 0.9948, + "grad_norm": 0.2490234375, + "learning_rate": 3.125998676740987e-05, + "loss": 0.8945, "step": 2240 }, { "epoch": 7.66865926558497, - "grad_norm": 0.296875, - "learning_rate": 0.00011303286556445694, - "loss": 0.9883, + "grad_norm": 0.248046875, + "learning_rate": 3.0827109103512643e-05, + "loss": 0.888, "step": 2245 }, { "epoch": 7.685738684884714, - "grad_norm": 0.328125, - "learning_rate": 0.00011263768481255264, - "loss": 0.996, + "grad_norm": 0.2470703125, + "learning_rate": 3.0396702729352023e-05, + "loss": 0.895, "step": 2250 }, { "epoch": 7.702818104184458, - "grad_norm": 0.349609375, - "learning_rate": 0.00011224230339480284, - "loss": 0.9896, + "grad_norm": 0.2470703125, + "learning_rate": 2.996878302174472e-05, + "loss": 0.89, "step": 2255 }, { "epoch": 7.719897523484201, - "grad_norm": 0.384765625, - "learning_rate": 0.00011184672758922034, - "loss": 0.9856, + "grad_norm": 0.26953125, + "learning_rate": 2.9543365268667867e-05, + "loss": 0.8868, "step": 2260 }, { "epoch": 7.736976942783945, - "grad_norm": 0.326171875, - "learning_rate": 0.00011145096367690444, - "loss": 0.9968, + "grad_norm": 0.26171875, + "learning_rate": 2.9120464668713188e-05, + "loss": 0.8944, "step": 2265 }, { "epoch": 7.7540563620836895, - "grad_norm": 0.322265625, - "learning_rate": 0.00011105501794194131, - "loss": 0.9938, + "grad_norm": 0.2392578125, + "learning_rate": 2.8700096330544012e-05, + "loss": 0.8946, "step": 2270 }, { "epoch": 7.771135781383433, - "grad_norm": 0.326171875, - "learning_rate": 0.00011065889667130414, - "loss": 0.9924, + "grad_norm": 0.267578125, + "learning_rate": 2.828227527235513e-05, + "loss": 0.8926, "step": 2275 }, { "epoch": 7.788215200683177, - "grad_norm": 0.32421875, - "learning_rate": 0.00011026260615475333, - "loss": 1.0, + "grad_norm": 0.2578125, + "learning_rate": 2.7867016421336776e-05, + "loss": 0.8984, "step": 2280 }, { "epoch": 7.805294619982921, - "grad_norm": 0.419921875, - "learning_rate": 0.00010986615268473661, - "loss": 0.9882, + "grad_norm": 0.26171875, + "learning_rate": 2.7454334613140864e-05, + "loss": 0.8874, "step": 2285 }, { "epoch": 7.822374039282664, - "grad_norm": 0.279296875, - "learning_rate": 0.00010946954255628928, - "loss": 0.9926, + "grad_norm": 0.2431640625, + "learning_rate": 2.7044244591351232e-05, + "loss": 0.892, "step": 2290 }, { "epoch": 7.839453458582408, - "grad_norm": 0.32421875, - "learning_rate": 0.00010907278206693395, - "loss": 0.9936, + "grad_norm": 0.248046875, + "learning_rate": 2.6636761006956955e-05, + "loss": 0.8936, "step": 2295 }, { "epoch": 7.856532877882152, - "grad_norm": 0.345703125, - "learning_rate": 0.00010867587751658079, - "loss": 0.9842, + "grad_norm": 0.24609375, + "learning_rate": 2.6231898417828603e-05, + "loss": 0.8856, "step": 2300 }, { "epoch": 7.873612297181896, - "grad_norm": 0.306640625, - "learning_rate": 0.00010827883520742741, - "loss": 0.9875, + "grad_norm": 0.25, + "learning_rate": 2.582967128819851e-05, + "loss": 0.8886, "step": 2305 }, { "epoch": 7.89069171648164, - "grad_norm": 0.359375, - "learning_rate": 0.00010788166144385888, - "loss": 0.9893, + "grad_norm": 0.2353515625, + "learning_rate": 2.5430093988143778e-05, + "loss": 0.8891, "step": 2310 }, { "epoch": 7.907771135781384, - "grad_norm": 0.50390625, - "learning_rate": 0.00010748436253234742, - "loss": 0.9815, + "grad_norm": 0.2421875, + "learning_rate": 2.5033180793072986e-05, + "loss": 0.8808, "step": 2315 }, { "epoch": 7.924850555081127, - "grad_norm": 0.373046875, - "learning_rate": 0.0001070869447813525, - "loss": 0.9863, + "grad_norm": 0.2392578125, + "learning_rate": 2.4638945883216235e-05, + "loss": 0.8868, "step": 2320 }, { "epoch": 7.941929974380871, - "grad_norm": 0.353515625, - "learning_rate": 0.00010668941450122055, - "loss": 0.9943, + "grad_norm": 0.2470703125, + "learning_rate": 2.4247403343118335e-05, + "loss": 0.8934, "step": 2325 }, { "epoch": 7.959009393680615, - "grad_norm": 0.3984375, - "learning_rate": 0.0001062917780040847, - "loss": 0.9886, + "grad_norm": 0.2392578125, + "learning_rate": 2.385856716113587e-05, + "loss": 0.8878, "step": 2330 }, { "epoch": 7.976088812980358, - "grad_norm": 0.28515625, - "learning_rate": 0.00010589404160376473, - "loss": 0.9928, + "grad_norm": 0.2421875, + "learning_rate": 2.3472451228937253e-05, + "loss": 0.8913, "step": 2335 }, { "epoch": 7.9931682322801025, - "grad_norm": 0.3046875, - "learning_rate": 0.0001054962116156667, - "loss": 0.9914, + "grad_norm": 0.2373046875, + "learning_rate": 2.3089069341006565e-05, + "loss": 0.8902, "step": 2340 }, { "epoch": 8.0, - "eval_loss": 2.468475341796875, - "eval_runtime": 0.5517, - "eval_samples_per_second": 18.125, - "eval_steps_per_second": 1.812, + "eval_loss": 2.592200994491577, + "eval_runtime": 0.5427, + "eval_samples_per_second": 18.425, + "eval_steps_per_second": 1.843, "step": 2342 }, { "epoch": 8.010247651579846, - "grad_norm": 0.359375, - "learning_rate": 0.00010509829435668265, - "loss": 0.9939, + "grad_norm": 0.236328125, + "learning_rate": 2.2708435194150634e-05, + "loss": 0.8945, "step": 2345 }, { "epoch": 8.02732707087959, - "grad_norm": 0.291015625, - "learning_rate": 0.00010470029614509041, - "loss": 0.9808, + "grad_norm": 0.2392578125, + "learning_rate": 2.2330562387009745e-05, + "loss": 0.8833, "step": 2350 }, { "epoch": 8.044406490179334, - "grad_norm": 0.30859375, - "learning_rate": 0.00010430222330045304, - "loss": 0.9807, + "grad_norm": 0.2412109375, + "learning_rate": 2.1955464419571782e-05, + "loss": 0.8823, "step": 2355 }, { "epoch": 8.061485909479078, - "grad_norm": 0.30859375, - "learning_rate": 0.00010390408214351892, - "loss": 0.9852, + "grad_norm": 0.2373046875, + "learning_rate": 2.1583154692689976e-05, + "loss": 0.8874, "step": 2360 }, { "epoch": 8.078565328778822, - "grad_norm": 0.33203125, - "learning_rate": 0.00010350587899612088, - "loss": 0.9704, + "grad_norm": 0.2470703125, + "learning_rate": 2.121364650760408e-05, + "loss": 0.8743, "step": 2365 }, { "epoch": 8.095644748078564, - "grad_norm": 0.37109375, - "learning_rate": 0.0001031076201810762, - "loss": 0.9861, + "grad_norm": 0.251953125, + "learning_rate": 2.08469530654652e-05, + "loss": 0.8872, "step": 2370 }, { "epoch": 8.112724167378309, - "grad_norm": 0.30859375, - "learning_rate": 0.00010270931202208595, - "loss": 0.9933, + "grad_norm": 0.2470703125, + "learning_rate": 2.048308746686417e-05, + "loss": 0.8936, "step": 2375 }, { "epoch": 8.129803586678053, - "grad_norm": 0.361328125, - "learning_rate": 0.00010231096084363483, - "loss": 0.9808, + "grad_norm": 0.236328125, + "learning_rate": 2.0122062711363532e-05, + "loss": 0.8818, "step": 2380 }, { "epoch": 8.146883005977797, - "grad_norm": 0.3671875, - "learning_rate": 0.00010191257297089052, - "loss": 0.9859, + "grad_norm": 0.25, + "learning_rate": 1.9763891697032978e-05, + "loss": 0.887, "step": 2385 }, { "epoch": 8.163962425277541, - "grad_norm": 0.28515625, - "learning_rate": 0.00010151415472960342, - "loss": 0.9839, + "grad_norm": 0.248046875, + "learning_rate": 1.9408587219988805e-05, + "loss": 0.884, "step": 2390 }, { "epoch": 8.181041844577285, - "grad_norm": 0.451171875, - "learning_rate": 0.00010111571244600606, - "loss": 0.9881, + "grad_norm": 0.2421875, + "learning_rate": 1.9056161973936513e-05, + "loss": 0.8892, "step": 2395 }, { "epoch": 8.198121263877027, - "grad_norm": 0.443359375, - "learning_rate": 0.00010071725244671282, - "loss": 0.9869, + "grad_norm": 0.240234375, + "learning_rate": 1.8706628549717452e-05, + "loss": 0.8883, "step": 2400 }, { "epoch": 8.215200683176771, - "grad_norm": 0.400390625, - "learning_rate": 0.00010031878105861923, - "loss": 0.979, + "grad_norm": 0.26953125, + "learning_rate": 1.835999943485892e-05, + "loss": 0.8802, "step": 2405 }, { "epoch": 8.232280102476516, - "grad_norm": 0.330078125, - "learning_rate": 9.992030460880181e-05, - "loss": 0.9873, + "grad_norm": 0.2451171875, + "learning_rate": 1.8016287013128018e-05, + "loss": 0.8886, "step": 2410 }, { "epoch": 8.24935952177626, - "grad_norm": 0.302734375, - "learning_rate": 9.952182942441733e-05, - "loss": 0.978, + "grad_norm": 0.2392578125, + "learning_rate": 1.767550356408938e-05, + "loss": 0.8784, "step": 2415 }, { "epoch": 8.266438941076004, - "grad_norm": 0.31640625, - "learning_rate": 9.91233618326026e-05, - "loss": 0.9911, + "grad_norm": 0.24609375, + "learning_rate": 1.7337661262666294e-05, + "loss": 0.8897, "step": 2420 }, { "epoch": 8.283518360375748, - "grad_norm": 0.365234375, - "learning_rate": 9.872490816037372e-05, - "loss": 0.9824, + "grad_norm": 0.2412109375, + "learning_rate": 1.7002772178705716e-05, + "loss": 0.8844, "step": 2425 }, { "epoch": 8.30059777967549, - "grad_norm": 0.322265625, - "learning_rate": 9.83264747345259e-05, - "loss": 0.9845, + "grad_norm": 0.2451171875, + "learning_rate": 1.6670848276547334e-05, + "loss": 0.8856, "step": 2430 }, { "epoch": 8.317677198975234, - "grad_norm": 0.34375, - "learning_rate": 9.792806788153271e-05, - "loss": 0.9752, + "grad_norm": 0.2421875, + "learning_rate": 1.6341901414595705e-05, + "loss": 0.8762, "step": 2435 }, { "epoch": 8.334756618274978, - "grad_norm": 0.322265625, - "learning_rate": 9.752969392744606e-05, - "loss": 0.9802, + "grad_norm": 0.2412109375, + "learning_rate": 1.601594334489702e-05, + "loss": 0.8802, "step": 2440 }, { "epoch": 8.351836037574722, - "grad_norm": 0.359375, - "learning_rate": 9.713135919779515e-05, - "loss": 0.9945, + "grad_norm": 0.2451171875, + "learning_rate": 1.5692985712719e-05, + "loss": 0.8939, "step": 2445 }, { "epoch": 8.368915456874467, - "grad_norm": 0.38671875, - "learning_rate": 9.673307001748661e-05, - "loss": 0.9825, + "grad_norm": 0.23828125, + "learning_rate": 1.5373040056134814e-05, + "loss": 0.8804, "step": 2450 }, { "epoch": 8.38599487617421, - "grad_norm": 0.349609375, - "learning_rate": 9.633483271070366e-05, - "loss": 0.9803, + "grad_norm": 0.240234375, + "learning_rate": 1.5056117805611115e-05, + "loss": 0.8806, "step": 2455 }, { "epoch": 8.403074295473953, - "grad_norm": 0.32421875, - "learning_rate": 9.593665360080599e-05, - "loss": 0.9841, + "grad_norm": 0.24609375, + "learning_rate": 1.474223028359939e-05, + "loss": 0.8856, "step": 2460 }, { "epoch": 8.420153714773697, - "grad_norm": 0.380859375, - "learning_rate": 9.553853901022913e-05, - "loss": 0.9777, + "grad_norm": 0.2470703125, + "learning_rate": 1.4431388704131632e-05, + "loss": 0.8791, "step": 2465 }, { "epoch": 8.437233134073441, - "grad_norm": 0.388671875, - "learning_rate": 9.514049526038418e-05, - "loss": 0.9871, + "grad_norm": 0.236328125, + "learning_rate": 1.4123604172419713e-05, + "loss": 0.8874, "step": 2470 }, { "epoch": 8.454312553373185, - "grad_norm": 0.328125, - "learning_rate": 9.474252867155732e-05, - "loss": 0.9821, + "grad_norm": 0.2412109375, + "learning_rate": 1.3818887684458426e-05, + "loss": 0.8827, "step": 2475 }, { "epoch": 8.47139197267293, - "grad_norm": 0.287109375, - "learning_rate": 9.43446455628097e-05, - "loss": 0.983, + "grad_norm": 0.24609375, + "learning_rate": 1.3517250126632986e-05, + "loss": 0.8847, "step": 2480 }, { "epoch": 8.488471391972674, - "grad_norm": 0.318359375, - "learning_rate": 9.394685225187683e-05, - "loss": 0.9807, + "grad_norm": 0.2470703125, + "learning_rate": 1.321870227532971e-05, + "loss": 0.8826, "step": 2485 }, { "epoch": 8.505550811272418, - "grad_norm": 0.330078125, - "learning_rate": 9.354915505506839e-05, - "loss": 0.991, + "grad_norm": 0.24609375, + "learning_rate": 1.292325479655131e-05, + "loss": 0.89, "step": 2490 }, { "epoch": 8.52263023057216, - "grad_norm": 0.3046875, - "learning_rate": 9.31515602871679e-05, - "loss": 0.9914, + "grad_norm": 0.234375, + "learning_rate": 1.263091824553574e-05, + "loss": 0.8904, "step": 2495 }, { "epoch": 8.539709649871904, - "grad_norm": 0.3125, - "learning_rate": 9.27540742613326e-05, - "loss": 0.9886, + "grad_norm": 0.2392578125, + "learning_rate": 1.2341703066379074e-05, + "loss": 0.888, "step": 2500 }, { "epoch": 8.556789069171648, - "grad_norm": 0.296875, - "learning_rate": 9.235670328899293e-05, - "loss": 0.9841, + "grad_norm": 0.236328125, + "learning_rate": 1.205561959166237e-05, + "loss": 0.8841, "step": 2505 }, { "epoch": 8.573868488471392, - "grad_norm": 0.341796875, - "learning_rate": 9.195945367975256e-05, - "loss": 0.9859, + "grad_norm": 0.2412109375, + "learning_rate": 1.1772678042082607e-05, + "loss": 0.8856, "step": 2510 }, { "epoch": 8.590947907771136, - "grad_norm": 0.298828125, - "learning_rate": 9.156233174128805e-05, - "loss": 0.9863, + "grad_norm": 0.2431640625, + "learning_rate": 1.149288852608743e-05, + "loss": 0.8871, "step": 2515 }, { "epoch": 8.608027327070879, - "grad_norm": 0.3203125, - "learning_rate": 9.116534377924883e-05, - "loss": 0.9789, + "grad_norm": 0.2412109375, + "learning_rate": 1.1216261039514087e-05, + "loss": 0.8817, "step": 2520 }, { "epoch": 8.625106746370623, - "grad_norm": 0.283203125, - "learning_rate": 9.076849609715693e-05, - "loss": 0.983, + "grad_norm": 0.244140625, + "learning_rate": 1.094280546523231e-05, + "loss": 0.8825, "step": 2525 }, { "epoch": 8.642186165670367, - "grad_norm": 0.310546875, - "learning_rate": 9.037179499630703e-05, - "loss": 0.9922, + "grad_norm": 0.23828125, + "learning_rate": 1.0672531572791178e-05, + "loss": 0.8922, "step": 2530 }, { "epoch": 8.659265584970111, - "grad_norm": 0.310546875, - "learning_rate": 8.997524677566627e-05, - "loss": 0.9863, + "grad_norm": 0.2421875, + "learning_rate": 1.0405449018070168e-05, + "loss": 0.8879, "step": 2535 }, { "epoch": 8.676345004269855, - "grad_norm": 0.2890625, - "learning_rate": 8.957885773177438e-05, - "loss": 0.9845, + "grad_norm": 0.24609375, + "learning_rate": 1.0141567342934132e-05, + "loss": 0.885, "step": 2540 }, { "epoch": 8.6934244235696, - "grad_norm": 0.291015625, - "learning_rate": 8.918263415864354e-05, - "loss": 0.9893, + "grad_norm": 0.25390625, + "learning_rate": 9.880895974892412e-06, + "loss": 0.8886, "step": 2545 }, { "epoch": 8.710503842869343, - "grad_norm": 0.33984375, - "learning_rate": 8.878658234765858e-05, - "loss": 0.9812, + "grad_norm": 0.2431640625, + "learning_rate": 9.623444226762035e-06, + "loss": 0.8805, "step": 2550 }, { "epoch": 8.727583262169086, - "grad_norm": 0.283203125, - "learning_rate": 8.839070858747697e-05, - "loss": 0.9855, + "grad_norm": 0.23828125, + "learning_rate": 9.369221296335006e-06, + "loss": 0.8866, "step": 2555 }, { "epoch": 8.74466268146883, - "grad_norm": 0.310546875, - "learning_rate": 8.799501916392912e-05, - "loss": 0.9797, + "grad_norm": 0.2353515625, + "learning_rate": 9.118236266049707e-06, + "loss": 0.8811, "step": 2560 }, { "epoch": 8.761742100768574, - "grad_norm": 0.333984375, - "learning_rate": 8.759952035991844e-05, - "loss": 0.9836, + "grad_norm": 0.248046875, + "learning_rate": 8.870498102666402e-06, + "loss": 0.8849, "step": 2565 }, { "epoch": 8.778821520068318, - "grad_norm": 0.29296875, - "learning_rate": 8.720421845532151e-05, - "loss": 0.9848, + "grad_norm": 0.2353515625, + "learning_rate": 8.626015656946895e-06, + "loss": 0.8857, "step": 2570 }, { "epoch": 8.795900939368062, - "grad_norm": 0.306640625, - "learning_rate": 8.680911972688855e-05, - "loss": 0.9837, + "grad_norm": 0.2373046875, + "learning_rate": 8.384797663338306e-06, + "loss": 0.8833, "step": 2575 }, { "epoch": 8.812980358667804, - "grad_norm": 0.283203125, - "learning_rate": 8.641423044814374e-05, - "loss": 0.9823, + "grad_norm": 0.2392578125, + "learning_rate": 8.146852739661105e-06, + "loss": 0.885, "step": 2580 }, { "epoch": 8.830059777967548, - "grad_norm": 0.310546875, - "learning_rate": 8.601955688928545e-05, - "loss": 0.9852, + "grad_norm": 0.23828125, + "learning_rate": 7.91218938680104e-06, + "loss": 0.8861, "step": 2585 }, { "epoch": 8.847139197267293, - "grad_norm": 0.302734375, - "learning_rate": 8.562510531708677e-05, - "loss": 0.9804, + "grad_norm": 0.24609375, + "learning_rate": 7.6808159884057e-06, + "loss": 0.88, "step": 2590 }, { "epoch": 8.864218616567037, - "grad_norm": 0.291015625, - "learning_rate": 8.5230881994796e-05, - "loss": 0.9783, + "grad_norm": 0.2451171875, + "learning_rate": 7.45274081058478e-06, + "loss": 0.8794, "step": 2595 }, { "epoch": 8.88129803586678, - "grad_norm": 0.357421875, - "learning_rate": 8.48368931820373e-05, - "loss": 0.9783, + "grad_norm": 0.236328125, + "learning_rate": 7.2279720016148244e-06, + "loss": 0.8801, "step": 2600 }, { "epoch": 8.898377455166525, - "grad_norm": 0.322265625, - "learning_rate": 8.444314513471107e-05, - "loss": 0.9813, + "grad_norm": 0.2373046875, + "learning_rate": 7.0065175916482095e-06, + "loss": 0.8818, "step": 2605 }, { "epoch": 8.915456874466269, - "grad_norm": 0.337890625, - "learning_rate": 8.404964410489485e-05, - "loss": 0.9846, + "grad_norm": 0.23828125, + "learning_rate": 6.788385492426053e-06, + "loss": 0.8856, "step": 2610 }, { "epoch": 8.932536293766011, - "grad_norm": 0.302734375, - "learning_rate": 8.365639634074382e-05, - "loss": 0.9875, + "grad_norm": 0.2412109375, + "learning_rate": 6.573583496995816e-06, + "loss": 0.8887, "step": 2615 }, { "epoch": 8.949615713065755, - "grad_norm": 0.2890625, - "learning_rate": 8.32634080863919e-05, - "loss": 0.9764, + "grad_norm": 0.2412109375, + "learning_rate": 6.36211927943271e-06, + "loss": 0.8778, "step": 2620 }, { "epoch": 8.9666951323655, - "grad_norm": 0.353515625, - "learning_rate": 8.287068558185225e-05, - "loss": 0.9899, + "grad_norm": 0.244140625, + "learning_rate": 6.1540003945655286e-06, + "loss": 0.8906, "step": 2625 }, { "epoch": 8.983774551665244, - "grad_norm": 0.28515625, - "learning_rate": 8.247823506291844e-05, - "loss": 0.9813, + "grad_norm": 0.2421875, + "learning_rate": 5.949234277706861e-06, + "loss": 0.8818, "step": 2630 }, { "epoch": 8.997438087105039, - "eval_loss": 2.474276065826416, - "eval_runtime": 0.6699, - "eval_samples_per_second": 14.929, - "eval_steps_per_second": 1.493, + "eval_loss": 2.5981767177581787, + "eval_runtime": 0.5523, + "eval_samples_per_second": 18.106, + "eval_steps_per_second": 1.811, "step": 2634 }, { "epoch": 9.000853970964988, - "grad_norm": 0.31640625, - "learning_rate": 8.208606276106528e-05, - "loss": 0.9885, + "grad_norm": 0.2431640625, + "learning_rate": 5.74782824438731e-06, + "loss": 0.8865, "step": 2635 }, { "epoch": 9.017933390264732, - "grad_norm": 0.3359375, - "learning_rate": 8.169417490335007e-05, - "loss": 0.9783, + "grad_norm": 0.2392578125, + "learning_rate": 5.549789490094304e-06, + "loss": 0.8846, "step": 2640 }, { "epoch": 9.035012809564474, - "grad_norm": 0.322265625, - "learning_rate": 8.130257771231348e-05, - "loss": 0.9778, + "grad_norm": 0.2373046875, + "learning_rate": 5.355125090014845e-06, + "loss": 0.8845, "step": 2645 }, { "epoch": 9.052092228864218, - "grad_norm": 0.333984375, - "learning_rate": 8.091127740588094e-05, - "loss": 0.9793, + "grad_norm": 0.466796875, + "learning_rate": 5.163841998782837e-06, + "loss": 0.8852, "step": 2650 }, { "epoch": 9.069171648163962, - "grad_norm": 0.328125, - "learning_rate": 8.052028019726371e-05, - "loss": 0.9762, + "grad_norm": 0.23828125, + "learning_rate": 4.975947050230712e-06, + "loss": 0.8831, "step": 2655 }, { "epoch": 9.086251067463706, - "grad_norm": 0.291015625, - "learning_rate": 8.012959229486061e-05, - "loss": 0.9777, + "grad_norm": 0.234375, + "learning_rate": 4.79144695714504e-06, + "loss": 0.8838, "step": 2660 }, { "epoch": 9.10333048676345, - "grad_norm": 0.447265625, - "learning_rate": 7.973921990215894e-05, - "loss": 0.9871, + "grad_norm": 0.2392578125, + "learning_rate": 4.610348311026958e-06, + "loss": 0.8892, "step": 2665 }, { "epoch": 9.120409906063195, - "grad_norm": 0.306640625, - "learning_rate": 7.934916921763628e-05, - "loss": 0.9776, + "grad_norm": 0.2373046875, + "learning_rate": 4.432657581856525e-06, + "loss": 0.882, "step": 2670 }, { "epoch": 9.137489325362937, - "grad_norm": 0.31640625, - "learning_rate": 7.895944643466203e-05, - "loss": 0.9795, + "grad_norm": 0.2470703125, + "learning_rate": 4.25838111786162e-06, + "loss": 0.8839, "step": 2675 }, { "epoch": 9.154568744662681, - "grad_norm": 0.318359375, - "learning_rate": 7.857005774139907e-05, - "loss": 0.9804, + "grad_norm": 0.2431640625, + "learning_rate": 4.087525145291204e-06, + "loss": 0.8854, "step": 2680 }, { "epoch": 9.171648163962425, - "grad_norm": 0.35546875, - "learning_rate": 7.818100932070546e-05, - "loss": 0.9759, + "grad_norm": 0.244140625, + "learning_rate": 3.920095768192722e-06, + "loss": 0.8823, "step": 2685 }, { "epoch": 9.18872758326217, - "grad_norm": 0.28515625, - "learning_rate": 7.779230735003628e-05, - "loss": 0.9785, + "grad_norm": 0.2353515625, + "learning_rate": 3.7560989681941992e-06, + "loss": 0.883, "step": 2690 }, { "epoch": 9.205807002561913, - "grad_norm": 0.28515625, - "learning_rate": 7.740395800134552e-05, - "loss": 0.9751, + "grad_norm": 0.2353515625, + "learning_rate": 3.595540604290437e-06, + "loss": 0.8795, "step": 2695 }, { "epoch": 9.222886421861658, - "grad_norm": 0.30859375, - "learning_rate": 7.701596744098818e-05, - "loss": 0.9831, + "grad_norm": 0.23828125, + "learning_rate": 3.4384264126337328e-06, + "loss": 0.8868, "step": 2700 }, { "epoch": 9.2399658411614, - "grad_norm": 0.333984375, - "learning_rate": 7.662834182962222e-05, - "loss": 0.984, + "grad_norm": 0.255859375, + "learning_rate": 3.284762006328945e-06, + "loss": 0.8884, "step": 2705 }, { "epoch": 9.257045260461144, - "grad_norm": 0.314453125, - "learning_rate": 7.624108732211081e-05, - "loss": 0.9777, + "grad_norm": 0.267578125, + "learning_rate": 3.1345528752329212e-06, + "loss": 0.8819, "step": 2710 }, { "epoch": 9.274124679760888, - "grad_norm": 0.337890625, - "learning_rate": 7.585421006742463e-05, - "loss": 0.9859, + "grad_norm": 0.23828125, + "learning_rate": 2.9878043857584415e-06, + "loss": 0.8893, "step": 2715 }, { "epoch": 9.291204099060632, - "grad_norm": 0.365234375, - "learning_rate": 7.54677162085442e-05, - "loss": 0.9765, + "grad_norm": 0.23828125, + "learning_rate": 2.8445217806824077e-06, + "loss": 0.8805, "step": 2720 }, { "epoch": 9.308283518360376, - "grad_norm": 0.29296875, - "learning_rate": 7.508161188236232e-05, - "loss": 0.9749, + "grad_norm": 0.2373046875, + "learning_rate": 2.704710178958603e-06, + "loss": 0.8796, "step": 2725 }, { "epoch": 9.32536293766012, - "grad_norm": 0.333984375, - "learning_rate": 7.469590321958662e-05, - "loss": 0.9815, + "grad_norm": 0.2392578125, + "learning_rate": 2.5683745755348044e-06, + "loss": 0.8853, "step": 2730 }, { "epoch": 9.342442356959863, - "grad_norm": 0.287109375, - "learning_rate": 7.431059634464229e-05, - "loss": 0.9796, + "grad_norm": 0.2373046875, + "learning_rate": 2.435519841174272e-06, + "loss": 0.8844, "step": 2735 }, { "epoch": 9.359521776259607, - "grad_norm": 0.3125, - "learning_rate": 7.392569737557474e-05, - "loss": 0.9791, + "grad_norm": 0.255859375, + "learning_rate": 2.30615072228183e-06, + "loss": 0.8838, "step": 2740 }, { "epoch": 9.376601195559351, - "grad_norm": 0.373046875, - "learning_rate": 7.354121242395254e-05, - "loss": 0.9854, + "grad_norm": 0.24609375, + "learning_rate": 2.180271840734216e-06, + "loss": 0.8895, "step": 2745 }, { "epoch": 9.393680614859095, - "grad_norm": 0.328125, - "learning_rate": 7.31571475947703e-05, - "loss": 0.971, + "grad_norm": 0.24609375, + "learning_rate": 2.057887693714988e-06, + "loss": 0.876, "step": 2750 }, { "epoch": 9.410760034158839, - "grad_norm": 0.29296875, - "learning_rate": 7.277350898635178e-05, - "loss": 0.9789, + "grad_norm": 0.2373046875, + "learning_rate": 1.9390026535538674e-06, + "loss": 0.8831, "step": 2755 }, { "epoch": 9.427839453458583, - "grad_norm": 0.357421875, - "learning_rate": 7.239030269025311e-05, - "loss": 0.9808, + "grad_norm": 0.2451171875, + "learning_rate": 1.8236209675705274e-06, + "loss": 0.8851, "step": 2760 }, { "epoch": 9.444918872758326, - "grad_norm": 0.31640625, - "learning_rate": 7.200753479116593e-05, - "loss": 0.9712, + "grad_norm": 0.236328125, + "learning_rate": 1.7117467579228053e-06, + "loss": 0.876, "step": 2765 }, { "epoch": 9.46199829205807, - "grad_norm": 0.2890625, - "learning_rate": 7.162521136682085e-05, - "loss": 0.975, + "grad_norm": 0.2421875, + "learning_rate": 1.6033840214595308e-06, + "loss": 0.879, "step": 2770 }, { "epoch": 9.479077711357814, - "grad_norm": 0.306640625, - "learning_rate": 7.124333848789091e-05, - "loss": 0.9875, + "grad_norm": 0.240234375, + "learning_rate": 1.4985366295776404e-06, + "loss": 0.8899, "step": 2775 }, { "epoch": 9.496157130657558, - "grad_norm": 0.287109375, - "learning_rate": 7.08619222178954e-05, - "loss": 0.9806, + "grad_norm": 0.232421875, + "learning_rate": 1.397208328083921e-06, + "loss": 0.8836, "step": 2780 }, { "epoch": 9.513236549957302, - "grad_norm": 0.353515625, - "learning_rate": 7.048096861310322e-05, - "loss": 0.9861, + "grad_norm": 0.2373046875, + "learning_rate": 1.2994027370611173e-06, + "loss": 0.8905, "step": 2785 }, { "epoch": 9.530315969257046, - "grad_norm": 0.35546875, - "learning_rate": 7.010048372243698e-05, - "loss": 0.9823, + "grad_norm": 0.2392578125, + "learning_rate": 1.205123350738746e-06, + "loss": 0.8875, "step": 2790 }, { "epoch": 9.547395388556788, - "grad_norm": 0.29296875, - "learning_rate": 6.972047358737681e-05, - "loss": 0.9795, + "grad_norm": 0.2470703125, + "learning_rate": 1.114373537368063e-06, + "loss": 0.8838, "step": 2795 }, { "epoch": 9.564474807856532, - "grad_norm": 0.423828125, - "learning_rate": 6.934094424186459e-05, - "loss": 0.976, + "grad_norm": 0.234375, + "learning_rate": 1.0271565391018922e-06, + "loss": 0.8807, "step": 2800 }, { "epoch": 9.581554227156277, - "grad_norm": 0.349609375, - "learning_rate": 6.8961901712208e-05, - "loss": 0.9709, + "grad_norm": 0.2470703125, + "learning_rate": 9.434754718787409e-07, + "loss": 0.875, "step": 2805 }, { "epoch": 9.59863364645602, - "grad_norm": 0.3515625, - "learning_rate": 6.858335201698485e-05, - "loss": 0.9793, + "grad_norm": 0.25390625, + "learning_rate": 8.633333253113995e-07, + "loss": 0.8845, "step": 2810 }, { "epoch": 9.615713065755765, - "grad_norm": 0.341796875, - "learning_rate": 6.820530116694756e-05, - "loss": 0.9741, + "grad_norm": 0.23828125, + "learning_rate": 7.867329625802833e-07, + "loss": 0.88, "step": 2815 }, { "epoch": 9.632792485055509, - "grad_norm": 0.30859375, - "learning_rate": 6.782775516492771e-05, - "loss": 0.9755, + "grad_norm": 0.236328125, + "learning_rate": 7.136771203310245e-07, + "loss": 0.8794, "step": 2820 }, { "epoch": 9.649871904355251, - "grad_norm": 0.328125, - "learning_rate": 6.745072000574075e-05, - "loss": 0.9871, + "grad_norm": 0.2373046875, + "learning_rate": 6.441684085767396e-07, + "loss": 0.8894, "step": 2825 }, { "epoch": 9.666951323654995, - "grad_norm": 0.287109375, - "learning_rate": 6.70742016760907e-05, - "loss": 0.9754, + "grad_norm": 0.2431640625, + "learning_rate": 5.782093106048159e-07, + "loss": 0.8803, "step": 2830 }, { "epoch": 9.68403074295474, - "grad_norm": 0.29296875, - "learning_rate": 6.669820615447522e-05, - "loss": 0.9797, + "grad_norm": 0.251953125, + "learning_rate": 5.158021828881032e-07, + "loss": 0.8844, "step": 2835 }, { "epoch": 9.701110162254484, - "grad_norm": 0.333984375, - "learning_rate": 6.632273941109064e-05, - "loss": 0.9813, + "grad_norm": 0.2373046875, + "learning_rate": 4.569492550008603e-07, + "loss": 0.8835, "step": 2840 }, { "epoch": 9.718189581554228, - "grad_norm": 0.298828125, - "learning_rate": 6.594780740773712e-05, - "loss": 0.9792, + "grad_norm": 0.2373046875, + "learning_rate": 4.016526295389622e-07, + "loss": 0.8832, "step": 2845 }, { "epoch": 9.735269000853972, - "grad_norm": 0.298828125, - "learning_rate": 6.5573416097724e-05, - "loss": 0.9728, + "grad_norm": 0.2421875, + "learning_rate": 3.49914282044872e-07, + "loss": 0.879, "step": 2850 }, { "epoch": 9.752348420153714, - "grad_norm": 0.3046875, - "learning_rate": 6.519957142577535e-05, - "loss": 0.9767, + "grad_norm": 0.23828125, + "learning_rate": 3.017360609370301e-07, + "loss": 0.8805, "step": 2855 }, { "epoch": 9.769427839453458, - "grad_norm": 0.318359375, - "learning_rate": 6.482627932793553e-05, - "loss": 0.9822, + "grad_norm": 0.2373046875, + "learning_rate": 2.5711968744382974e-07, + "loss": 0.8853, "step": 2860 }, { "epoch": 9.786507258753202, - "grad_norm": 0.31640625, - "learning_rate": 6.445354573147484e-05, - "loss": 0.9855, + "grad_norm": 0.2412109375, + "learning_rate": 2.1606675554209922e-07, + "loss": 0.8901, "step": 2865 }, { "epoch": 9.803586678052946, - "grad_norm": 0.314453125, - "learning_rate": 6.408137655479554e-05, - "loss": 0.9796, + "grad_norm": 0.2392578125, + "learning_rate": 1.7857873190019192e-07, + "loss": 0.8816, "step": 2870 }, { "epoch": 9.82066609735269, - "grad_norm": 0.294921875, - "learning_rate": 6.370977770733777e-05, - "loss": 0.9791, + "grad_norm": 0.2353515625, + "learning_rate": 1.446569558255395e-07, + "loss": 0.8823, "step": 2875 }, { "epoch": 9.837745516652435, - "grad_norm": 0.294921875, - "learning_rate": 6.333875508948593e-05, - "loss": 0.9791, + "grad_norm": 0.23828125, + "learning_rate": 1.143026392168789e-07, + "loss": 0.8837, "step": 2880 }, { "epoch": 9.854824935952177, - "grad_norm": 0.3125, - "learning_rate": 6.296831459247464e-05, - "loss": 0.9791, + "grad_norm": 0.2333984375, + "learning_rate": 8.751686652084256e-08, + "loss": 0.8835, "step": 2885 }, { "epoch": 9.871904355251921, - "grad_norm": 0.33203125, - "learning_rate": 6.259846209829551e-05, - "loss": 0.9785, + "grad_norm": 0.2392578125, + "learning_rate": 6.430059469334504e-08, + "loss": 0.8839, "step": 2890 }, { "epoch": 9.888983774551665, - "grad_norm": 0.30078125, - "learning_rate": 6.22292034796035e-05, - "loss": 0.9827, + "grad_norm": 0.2373046875, + "learning_rate": 4.465465316529915e-08, + "loss": 0.8863, "step": 2895 }, { "epoch": 9.90606319385141, - "grad_norm": 0.298828125, - "learning_rate": 6.186054459962399e-05, - "loss": 0.9758, + "grad_norm": 0.2412109375, + "learning_rate": 2.8579743813006432e-08, + "loss": 0.8822, "step": 2900 }, { "epoch": 9.923142613151153, - "grad_norm": 0.287109375, - "learning_rate": 6.149249131205931e-05, - "loss": 0.9788, + "grad_norm": 0.244140625, + "learning_rate": 1.6076440933099345e-08, + "loss": 0.8817, "step": 2905 }, { "epoch": 9.940222032450897, - "grad_norm": 0.287109375, - "learning_rate": 6.112504946099604e-05, - "loss": 0.9773, + "grad_norm": 0.23828125, + "learning_rate": 7.145191222035497e-09, + "loss": 0.8827, "step": 2910 }, { "epoch": 9.95730145175064, - "grad_norm": 0.296875, - "learning_rate": 6.075822488081213e-05, - "loss": 0.9769, + "grad_norm": 0.2412109375, + "learning_rate": 1.7863137600993008e-09, + "loss": 0.881, "step": 2915 }, { "epoch": 9.974380871050384, - "grad_norm": 0.29296875, - "learning_rate": 6.039202339608432e-05, - "loss": 0.9809, + "grad_norm": 0.2421875, + "learning_rate": 0.0, + "loss": 0.8852, "step": 2920 }, { - "epoch": 9.991460290350128, - "grad_norm": 0.384765625, - "learning_rate": 6.0026450821495536e-05, - "loss": 0.9756, - "step": 2925 - }, - { - "epoch": 9.998292058070026, - "eval_loss": 2.4803388118743896, - "eval_runtime": 0.5616, - "eval_samples_per_second": 17.806, - "eval_steps_per_second": 1.781, - "step": 2927 - }, - { - "epoch": 10.008539709649872, - "grad_norm": 0.314453125, - "learning_rate": 5.966151296174268e-05, - "loss": 0.975, - "step": 2930 - }, - { - "epoch": 10.025619128949616, - "grad_norm": 0.310546875, - "learning_rate": 5.929721561144439e-05, - "loss": 0.9813, - "step": 2935 + "epoch": 9.974380871050384, + "eval_loss": 2.5989506244659424, + "eval_runtime": 0.5586, + "eval_samples_per_second": 17.903, + "eval_steps_per_second": 1.79, + "step": 2920 }, { - "epoch": 10.04269854824936, - "grad_norm": 0.291015625, - "learning_rate": 5.8933564555049105e-05, - "loss": 0.9799, - "step": 2940 - }, - { - "epoch": 10.059777967549103, - "grad_norm": 0.29296875, - "learning_rate": 5.857056556674313e-05, - "loss": 0.9724, - "step": 2945 - }, - { - "epoch": 10.076857386848847, - "grad_norm": 0.306640625, - "learning_rate": 5.820822441035899e-05, - "loss": 0.9755, - "step": 2950 - }, - { - "epoch": 10.09393680614859, - "grad_norm": 0.296875, - "learning_rate": 5.784654683928391e-05, - "loss": 0.9773, - "step": 2955 - }, - { - "epoch": 10.111016225448335, - "grad_norm": 0.287109375, - "learning_rate": 5.7485538596368496e-05, - "loss": 0.9698, - "step": 2960 - }, - { - "epoch": 10.128095644748079, - "grad_norm": 0.28125, - "learning_rate": 5.7125205413835504e-05, - "loss": 0.9774, - "step": 2965 - }, - { - "epoch": 10.145175064047823, - "grad_norm": 0.337890625, - "learning_rate": 5.6765553013188766e-05, - "loss": 0.9788, - "step": 2970 - }, - { - "epoch": 10.162254483347565, - "grad_norm": 0.30859375, - "learning_rate": 5.6406587105122475e-05, - "loss": 0.9814, - "step": 2975 - }, - { - "epoch": 10.17933390264731, - "grad_norm": 0.298828125, - "learning_rate": 5.6048313389430484e-05, - "loss": 0.9787, - "step": 2980 - }, - { - "epoch": 10.196413321947054, - "grad_norm": 0.310546875, - "learning_rate": 5.5690737554915604e-05, - "loss": 0.9741, - "step": 2985 - }, - { - "epoch": 10.213492741246798, - "grad_norm": 0.28515625, - "learning_rate": 5.533386527929962e-05, - "loss": 0.9735, - "step": 2990 - }, - { - "epoch": 10.230572160546542, - "grad_norm": 0.296875, - "learning_rate": 5.4977702229132745e-05, - "loss": 0.9736, - "step": 2995 - }, - { - "epoch": 10.247651579846286, - "grad_norm": 0.279296875, - "learning_rate": 5.462225405970401e-05, - "loss": 0.9754, - "step": 3000 - }, - { - "epoch": 10.264730999146028, - "grad_norm": 0.2734375, - "learning_rate": 5.4267526414951296e-05, - "loss": 0.9734, - "step": 3005 - }, - { - "epoch": 10.281810418445772, - "grad_norm": 0.298828125, - "learning_rate": 5.391352492737157e-05, - "loss": 0.9737, - "step": 3010 - }, - { - "epoch": 10.298889837745516, - "grad_norm": 0.322265625, - "learning_rate": 5.3560255217931785e-05, - "loss": 0.9729, - "step": 3015 - }, - { - "epoch": 10.31596925704526, - "grad_norm": 0.302734375, - "learning_rate": 5.3207722895979406e-05, - "loss": 0.9706, - "step": 3020 - }, - { - "epoch": 10.333048676345005, - "grad_norm": 0.287109375, - "learning_rate": 5.285593355915328e-05, - "loss": 0.976, - "step": 3025 - }, - { - "epoch": 10.350128095644749, - "grad_norm": 0.314453125, - "learning_rate": 5.2504892793295e-05, - "loss": 0.9657, - "step": 3030 - }, - { - "epoch": 10.367207514944491, - "grad_norm": 0.341796875, - "learning_rate": 5.215460617235993e-05, - "loss": 0.9731, - "step": 3035 - }, - { - "epoch": 10.384286934244235, - "grad_norm": 0.29296875, - "learning_rate": 5.1805079258329056e-05, - "loss": 0.978, - "step": 3040 - }, - { - "epoch": 10.40136635354398, - "grad_norm": 0.404296875, - "learning_rate": 5.145631760112022e-05, - "loss": 0.9757, - "step": 3045 - }, - { - "epoch": 10.418445772843723, - "grad_norm": 0.3125, - "learning_rate": 5.110832673850039e-05, - "loss": 0.9779, - "step": 3050 - }, - { - "epoch": 10.435525192143468, - "grad_norm": 0.294921875, - "learning_rate": 5.076111219599745e-05, - "loss": 0.9718, - "step": 3055 - }, - { - "epoch": 10.452604611443212, - "grad_norm": 0.359375, - "learning_rate": 5.041467948681269e-05, - "loss": 0.9796, - "step": 3060 - }, - { - "epoch": 10.469684030742954, - "grad_norm": 0.291015625, - "learning_rate": 5.0069034111733184e-05, - "loss": 0.9741, - "step": 3065 - }, - { - "epoch": 10.486763450042698, - "grad_norm": 0.283203125, - "learning_rate": 4.9724181559044234e-05, - "loss": 0.9757, - "step": 3070 - }, - { - "epoch": 10.503842869342442, - "grad_norm": 0.29296875, - "learning_rate": 4.9380127304442634e-05, - "loss": 0.9777, - "step": 3075 - }, - { - "epoch": 10.520922288642186, - "grad_norm": 0.27734375, - "learning_rate": 4.903687681094942e-05, - "loss": 0.973, - "step": 3080 - }, - { - "epoch": 10.53800170794193, - "grad_norm": 0.279296875, - "learning_rate": 4.8694435528823135e-05, - "loss": 0.9822, - "step": 3085 - }, - { - "epoch": 10.555081127241674, - "grad_norm": 0.29296875, - "learning_rate": 4.835280889547351e-05, - "loss": 0.9754, - "step": 3090 - }, - { - "epoch": 10.572160546541417, - "grad_norm": 0.28125, - "learning_rate": 4.801200233537483e-05, - "loss": 0.9711, - "step": 3095 - }, - { - "epoch": 10.589239965841161, - "grad_norm": 0.298828125, - "learning_rate": 4.767202125998005e-05, - "loss": 0.9782, - "step": 3100 - }, - { - "epoch": 10.606319385140905, - "grad_norm": 0.30078125, - "learning_rate": 4.733287106763481e-05, - "loss": 0.9784, - "step": 3105 - }, - { - "epoch": 10.623398804440649, - "grad_norm": 0.298828125, - "learning_rate": 4.699455714349152e-05, - "loss": 0.9724, - "step": 3110 - }, - { - "epoch": 10.640478223740393, - "grad_norm": 0.287109375, - "learning_rate": 4.665708485942417e-05, - "loss": 0.9729, - "step": 3115 - }, - { - "epoch": 10.657557643040137, - "grad_norm": 0.28515625, - "learning_rate": 4.6320459573942856e-05, - "loss": 0.9775, - "step": 3120 - }, - { - "epoch": 10.67463706233988, - "grad_norm": 0.3046875, - "learning_rate": 4.5984686632108585e-05, - "loss": 0.9749, - "step": 3125 - }, - { - "epoch": 10.691716481639624, - "grad_norm": 0.28515625, - "learning_rate": 4.564977136544873e-05, - "loss": 0.9796, - "step": 3130 - }, - { - "epoch": 10.708795900939368, - "grad_norm": 0.279296875, - "learning_rate": 4.531571909187197e-05, - "loss": 0.9731, - "step": 3135 - }, - { - "epoch": 10.725875320239112, - "grad_norm": 0.279296875, - "learning_rate": 4.49825351155843e-05, - "loss": 0.9734, - "step": 3140 - }, - { - "epoch": 10.742954739538856, - "grad_norm": 0.302734375, - "learning_rate": 4.4650224727004334e-05, - "loss": 0.9709, - "step": 3145 - }, - { - "epoch": 10.7600341588386, - "grad_norm": 0.28515625, - "learning_rate": 4.431879320267972e-05, - "loss": 0.9739, - "step": 3150 - }, - { - "epoch": 10.777113578138342, - "grad_norm": 0.3203125, - "learning_rate": 4.398824580520302e-05, - "loss": 0.9762, - "step": 3155 - }, - { - "epoch": 10.794192997438087, - "grad_norm": 0.287109375, - "learning_rate": 4.3658587783128425e-05, - "loss": 0.9749, - "step": 3160 - }, - { - "epoch": 10.81127241673783, - "grad_norm": 0.29296875, - "learning_rate": 4.332982437088825e-05, - "loss": 0.9769, - "step": 3165 - }, - { - "epoch": 10.828351836037575, - "grad_norm": 0.27734375, - "learning_rate": 4.300196078870982e-05, - "loss": 0.9783, - "step": 3170 - }, - { - "epoch": 10.845431255337319, - "grad_norm": 0.2890625, - "learning_rate": 4.267500224253269e-05, - "loss": 0.9817, - "step": 3175 - }, - { - "epoch": 10.862510674637063, - "grad_norm": 0.28125, - "learning_rate": 4.2348953923925916e-05, - "loss": 0.9776, - "step": 3180 - }, - { - "epoch": 10.879590093936805, - "grad_norm": 0.2890625, - "learning_rate": 4.202382101000554e-05, - "loss": 0.9771, - "step": 3185 - }, - { - "epoch": 10.89666951323655, - "grad_norm": 0.2890625, - "learning_rate": 4.16996086633526e-05, - "loss": 0.9716, - "step": 3190 - }, - { - "epoch": 10.913748932536294, - "grad_norm": 0.28125, - "learning_rate": 4.137632203193086e-05, - "loss": 0.9845, - "step": 3195 - }, - { - "epoch": 10.930828351836038, - "grad_norm": 0.330078125, - "learning_rate": 4.105396624900538e-05, - "loss": 0.9799, - "step": 3200 - }, - { - "epoch": 10.947907771135782, - "grad_norm": 0.314453125, - "learning_rate": 4.073254643306086e-05, - "loss": 0.97, - "step": 3205 - }, - { - "epoch": 10.964987190435526, - "grad_norm": 0.28515625, - "learning_rate": 4.041206768772022e-05, - "loss": 0.977, - "step": 3210 - }, - { - "epoch": 10.98206660973527, - "grad_norm": 0.28125, - "learning_rate": 4.009253510166386e-05, - "loss": 0.9714, - "step": 3215 - }, - { - "epoch": 10.999146029035012, - "grad_norm": 0.279296875, - "learning_rate": 3.977395374854871e-05, - "loss": 0.9815, - "step": 3220 - }, - { - "epoch": 10.999146029035012, - "eval_loss": 2.4823217391967773, - "eval_runtime": 0.5642, - "eval_samples_per_second": 17.723, - "eval_steps_per_second": 1.772, - "step": 3220 - }, - { - "epoch": 11.016225448334756, - "grad_norm": 0.28125, - "learning_rate": 3.9456328686927525e-05, - "loss": 0.9669, - "step": 3225 - }, - { - "epoch": 11.0333048676345, - "grad_norm": 0.275390625, - "learning_rate": 3.913966496016891e-05, - "loss": 0.9682, - "step": 3230 - }, - { - "epoch": 11.050384286934245, - "grad_norm": 0.34375, - "learning_rate": 3.88239675963768e-05, - "loss": 0.9733, - "step": 3235 - }, - { - "epoch": 11.067463706233989, - "grad_norm": 0.291015625, - "learning_rate": 3.850924160831115e-05, - "loss": 0.9732, - "step": 3240 - }, - { - "epoch": 11.084543125533731, - "grad_norm": 0.283203125, - "learning_rate": 3.819549199330784e-05, - "loss": 0.9702, - "step": 3245 - }, - { - "epoch": 11.101622544833475, - "grad_norm": 0.302734375, - "learning_rate": 3.788272373319955e-05, - "loss": 0.974, - "step": 3250 - }, - { - "epoch": 11.11870196413322, - "grad_norm": 0.294921875, - "learning_rate": 3.757094179423672e-05, - "loss": 0.9747, - "step": 3255 - }, - { - "epoch": 11.135781383432963, - "grad_norm": 0.306640625, - "learning_rate": 3.726015112700859e-05, - "loss": 0.9854, - "step": 3260 - }, - { - "epoch": 11.152860802732707, - "grad_norm": 0.298828125, - "learning_rate": 3.695035666636464e-05, - "loss": 0.9724, - "step": 3265 - }, - { - "epoch": 11.169940222032452, - "grad_norm": 0.298828125, - "learning_rate": 3.6641563331336125e-05, - "loss": 0.9668, - "step": 3270 - }, - { - "epoch": 11.187019641332194, - "grad_norm": 0.28515625, - "learning_rate": 3.633377602505815e-05, - "loss": 0.9708, - "step": 3275 - }, - { - "epoch": 11.204099060631938, - "grad_norm": 0.287109375, - "learning_rate": 3.6026999634691725e-05, - "loss": 0.9728, - "step": 3280 - }, - { - "epoch": 11.221178479931682, - "grad_norm": 0.28515625, - "learning_rate": 3.5721239031346066e-05, - "loss": 0.987, - "step": 3285 - }, - { - "epoch": 11.238257899231426, - "grad_norm": 0.302734375, - "learning_rate": 3.541649907000147e-05, - "loss": 0.9808, - "step": 3290 - }, - { - "epoch": 11.25533731853117, - "grad_norm": 0.2890625, - "learning_rate": 3.511278458943197e-05, - "loss": 0.9693, - "step": 3295 - }, - { - "epoch": 11.272416737830914, - "grad_norm": 0.279296875, - "learning_rate": 3.4810100412128747e-05, - "loss": 0.9766, - "step": 3300 - }, - { - "epoch": 11.289496157130657, - "grad_norm": 0.310546875, - "learning_rate": 3.4508451344223425e-05, - "loss": 0.9756, - "step": 3305 - }, - { - "epoch": 11.3065755764304, - "grad_norm": 0.298828125, - "learning_rate": 3.42078421754117e-05, - "loss": 0.9723, - "step": 3310 - }, - { - "epoch": 11.323654995730145, - "grad_norm": 0.27734375, - "learning_rate": 3.3908277678877445e-05, - "loss": 0.972, - "step": 3315 - }, - { - "epoch": 11.340734415029889, - "grad_norm": 0.283203125, - "learning_rate": 3.360976261121684e-05, - "loss": 0.9771, - "step": 3320 - }, - { - "epoch": 11.357813834329633, - "grad_norm": 0.298828125, - "learning_rate": 3.331230171236277e-05, - "loss": 0.9764, - "step": 3325 - }, - { - "epoch": 11.374893253629377, - "grad_norm": 0.29296875, - "learning_rate": 3.3015899705509734e-05, - "loss": 0.9716, - "step": 3330 - }, - { - "epoch": 11.391972672929121, - "grad_norm": 0.279296875, - "learning_rate": 3.272056129703861e-05, - "loss": 0.9743, - "step": 3335 - }, - { - "epoch": 11.409052092228864, - "grad_norm": 0.27734375, - "learning_rate": 3.242629117644229e-05, - "loss": 0.9731, - "step": 3340 - }, - { - "epoch": 11.426131511528608, - "grad_norm": 0.306640625, - "learning_rate": 3.21330940162508e-05, - "loss": 0.9776, - "step": 3345 - }, - { - "epoch": 11.443210930828352, - "grad_norm": 0.283203125, - "learning_rate": 3.184097447195732e-05, - "loss": 0.9695, - "step": 3350 - }, - { - "epoch": 11.460290350128096, - "grad_norm": 0.302734375, - "learning_rate": 3.1549937181944346e-05, - "loss": 0.9792, - "step": 3355 - }, - { - "epoch": 11.47736976942784, - "grad_norm": 0.29296875, - "learning_rate": 3.125998676740987e-05, - "loss": 0.9689, - "step": 3360 - }, - { - "epoch": 11.494449188727582, - "grad_norm": 0.3046875, - "learning_rate": 3.097112783229412e-05, - "loss": 0.9706, - "step": 3365 - }, - { - "epoch": 11.511528608027326, - "grad_norm": 0.28515625, - "learning_rate": 3.068336496320631e-05, - "loss": 0.975, - "step": 3370 - }, - { - "epoch": 11.52860802732707, - "grad_norm": 0.2734375, - "learning_rate": 3.0396702729352023e-05, - "loss": 0.9742, - "step": 3375 - }, - { - "epoch": 11.545687446626815, - "grad_norm": 0.279296875, - "learning_rate": 3.0111145682460507e-05, - "loss": 0.9709, - "step": 3380 - }, - { - "epoch": 11.562766865926559, - "grad_norm": 0.28515625, - "learning_rate": 2.9826698356712403e-05, - "loss": 0.9664, - "step": 3385 - }, - { - "epoch": 11.579846285226303, - "grad_norm": 0.30859375, - "learning_rate": 2.9543365268667867e-05, - "loss": 0.973, - "step": 3390 - }, - { - "epoch": 11.596925704526047, - "grad_norm": 0.291015625, - "learning_rate": 2.926115091719467e-05, - "loss": 0.9723, - "step": 3395 - }, - { - "epoch": 11.61400512382579, - "grad_norm": 0.29296875, - "learning_rate": 2.8980059783396953e-05, - "loss": 0.9707, - "step": 3400 - }, - { - "epoch": 11.631084543125533, - "grad_norm": 0.279296875, - "learning_rate": 2.8700096330544012e-05, - "loss": 0.9702, - "step": 3405 - }, - { - "epoch": 11.648163962425278, - "grad_norm": 0.283203125, - "learning_rate": 2.8421265003999286e-05, - "loss": 0.9788, - "step": 3410 - }, - { - "epoch": 11.665243381725022, - "grad_norm": 0.298828125, - "learning_rate": 2.8143570231150006e-05, - "loss": 0.9708, - "step": 3415 - }, - { - "epoch": 11.682322801024766, - "grad_norm": 0.291015625, - "learning_rate": 2.7867016421336776e-05, - "loss": 0.9803, - "step": 3420 - }, - { - "epoch": 11.699402220324508, - "grad_norm": 0.283203125, - "learning_rate": 2.759160796578347e-05, - "loss": 0.9667, - "step": 3425 - }, - { - "epoch": 11.716481639624252, - "grad_norm": 0.271484375, - "learning_rate": 2.7317349237527724e-05, - "loss": 0.9794, - "step": 3430 - }, - { - "epoch": 11.733561058923996, - "grad_norm": 0.271484375, - "learning_rate": 2.7044244591351232e-05, - "loss": 0.9736, - "step": 3435 - }, - { - "epoch": 11.75064047822374, - "grad_norm": 0.2890625, - "learning_rate": 2.6772298363710956e-05, - "loss": 0.976, - "step": 3440 - }, - { - "epoch": 11.767719897523484, - "grad_norm": 0.275390625, - "learning_rate": 2.6501514872669865e-05, - "loss": 0.9744, - "step": 3445 - }, - { - "epoch": 11.784799316823229, - "grad_norm": 0.287109375, - "learning_rate": 2.6231898417828603e-05, - "loss": 0.9771, - "step": 3450 - }, - { - "epoch": 11.801878736122973, - "grad_norm": 0.28515625, - "learning_rate": 2.5963453280257267e-05, - "loss": 0.9744, - "step": 3455 - }, - { - "epoch": 11.818958155422715, - "grad_norm": 0.28125, - "learning_rate": 2.569618372242727e-05, - "loss": 0.9797, - "step": 3460 - }, - { - "epoch": 11.836037574722459, - "grad_norm": 0.302734375, - "learning_rate": 2.5430093988143778e-05, - "loss": 0.9775, - "step": 3465 - }, - { - "epoch": 11.853116994022203, - "grad_norm": 0.27734375, - "learning_rate": 2.5165188302478215e-05, - "loss": 0.9702, - "step": 3470 - }, - { - "epoch": 11.870196413321947, - "grad_norm": 0.275390625, - "learning_rate": 2.4901470871701305e-05, - "loss": 0.9738, - "step": 3475 - }, - { - "epoch": 11.887275832621691, - "grad_norm": 0.279296875, - "learning_rate": 2.4638945883216235e-05, - "loss": 0.968, - "step": 3480 - }, - { - "epoch": 11.904355251921434, - "grad_norm": 0.279296875, - "learning_rate": 2.4377617505492046e-05, - "loss": 0.9707, - "step": 3485 - }, - { - "epoch": 11.921434671221178, - "grad_norm": 0.298828125, - "learning_rate": 2.411748988799769e-05, - "loss": 0.9727, - "step": 3490 - }, - { - "epoch": 11.938514090520922, - "grad_norm": 0.28515625, - "learning_rate": 2.385856716113587e-05, - "loss": 0.9754, - "step": 3495 - }, - { - "epoch": 11.955593509820666, - "grad_norm": 0.2890625, - "learning_rate": 2.3600853436177672e-05, - "loss": 0.9699, - "step": 3500 - }, - { - "epoch": 11.97267292912041, - "grad_norm": 0.287109375, - "learning_rate": 2.3344352805197212e-05, - "loss": 0.9726, - "step": 3505 - }, - { - "epoch": 11.989752348420154, - "grad_norm": 0.30078125, - "learning_rate": 2.3089069341006565e-05, - "loss": 0.9657, - "step": 3510 - }, - { - "epoch": 12.0, - "eval_loss": 2.4844307899475098, - "eval_runtime": 0.551, - "eval_samples_per_second": 18.15, - "eval_steps_per_second": 1.815, - "step": 3513 - }, - { - "epoch": 12.006831767719898, - "grad_norm": 0.283203125, - "learning_rate": 2.2835007097091267e-05, - "loss": 0.9726, - "step": 3515 - }, - { - "epoch": 12.02391118701964, - "grad_norm": 0.28125, - "learning_rate": 2.2582170107545852e-05, - "loss": 0.9707, - "step": 3520 - }, - { - "epoch": 12.040990606319385, - "grad_norm": 0.271484375, - "learning_rate": 2.2330562387009745e-05, - "loss": 0.9734, - "step": 3525 - }, - { - "epoch": 12.058070025619129, - "grad_norm": 0.28125, - "learning_rate": 2.2080187930603668e-05, - "loss": 0.9771, - "step": 3530 - }, - { - "epoch": 12.075149444918873, - "grad_norm": 0.275390625, - "learning_rate": 2.1831050713866007e-05, - "loss": 0.9756, - "step": 3535 - }, - { - "epoch": 12.092228864218617, - "grad_norm": 0.291015625, - "learning_rate": 2.1583154692689976e-05, - "loss": 0.9753, - "step": 3540 - }, - { - "epoch": 12.109308283518361, - "grad_norm": 0.2734375, - "learning_rate": 2.1336503803260456e-05, - "loss": 0.9644, - "step": 3545 - }, - { - "epoch": 12.126387702818104, - "grad_norm": 0.287109375, - "learning_rate": 2.109110196199171e-05, - "loss": 0.9691, - "step": 3550 - }, - { - "epoch": 12.143467122117848, - "grad_norm": 0.2890625, - "learning_rate": 2.08469530654652e-05, - "loss": 0.9801, - "step": 3555 - }, - { - "epoch": 12.160546541417592, - "grad_norm": 0.294921875, - "learning_rate": 2.0604060990367624e-05, - "loss": 0.966, - "step": 3560 - }, - { - "epoch": 12.177625960717336, - "grad_norm": 0.28125, - "learning_rate": 2.0362429593429432e-05, - "loss": 0.9698, - "step": 3565 - }, - { - "epoch": 12.19470538001708, - "grad_norm": 0.275390625, - "learning_rate": 2.0122062711363532e-05, - "loss": 0.9717, - "step": 3570 - }, - { - "epoch": 12.211784799316824, - "grad_norm": 0.275390625, - "learning_rate": 1.988296416080435e-05, - "loss": 0.9793, - "step": 3575 - }, - { - "epoch": 12.228864218616566, - "grad_norm": 0.294921875, - "learning_rate": 1.9645137738247422e-05, - "loss": 0.9743, - "step": 3580 - }, - { - "epoch": 12.24594363791631, - "grad_norm": 0.287109375, - "learning_rate": 1.9408587219988805e-05, - "loss": 0.969, - "step": 3585 - }, - { - "epoch": 12.263023057216055, - "grad_norm": 0.27734375, - "learning_rate": 1.9173316362065384e-05, - "loss": 0.9621, - "step": 3590 - }, - { - "epoch": 12.280102476515799, - "grad_norm": 0.28125, - "learning_rate": 1.893932890019503e-05, - "loss": 0.9786, - "step": 3595 - }, - { - "epoch": 12.297181895815543, - "grad_norm": 0.27734375, - "learning_rate": 1.8706628549717452e-05, - "loss": 0.971, - "step": 3600 - }, - { - "epoch": 12.314261315115287, - "grad_norm": 0.279296875, - "learning_rate": 1.8475219005535117e-05, - "loss": 0.9797, - "step": 3605 - }, - { - "epoch": 12.33134073441503, - "grad_norm": 0.291015625, - "learning_rate": 1.824510394205453e-05, - "loss": 0.9727, - "step": 3610 - }, - { - "epoch": 12.348420153714773, - "grad_norm": 0.28515625, - "learning_rate": 1.8016287013128018e-05, - "loss": 0.9723, - "step": 3615 - }, - { - "epoch": 12.365499573014517, - "grad_norm": 0.2734375, - "learning_rate": 1.7788771851995655e-05, - "loss": 0.9764, - "step": 3620 - }, - { - "epoch": 12.382578992314262, - "grad_norm": 0.279296875, - "learning_rate": 1.7562562071227474e-05, - "loss": 0.9665, - "step": 3625 - }, - { - "epoch": 12.399658411614006, - "grad_norm": 0.291015625, - "learning_rate": 1.7337661262666294e-05, - "loss": 0.9737, - "step": 3630 - }, - { - "epoch": 12.41673783091375, - "grad_norm": 0.275390625, - "learning_rate": 1.711407299737049e-05, - "loss": 0.9671, - "step": 3635 - }, - { - "epoch": 12.433817250213492, - "grad_norm": 0.283203125, - "learning_rate": 1.6891800825557535e-05, - "loss": 0.975, - "step": 3640 - }, - { - "epoch": 12.450896669513236, - "grad_norm": 0.28515625, - "learning_rate": 1.6670848276547334e-05, - "loss": 0.9767, - "step": 3645 - }, - { - "epoch": 12.46797608881298, - "grad_norm": 0.2734375, - "learning_rate": 1.6451218858706374e-05, - "loss": 0.9773, - "step": 3650 - }, - { - "epoch": 12.485055508112724, - "grad_norm": 0.30078125, - "learning_rate": 1.6232916059392e-05, - "loss": 0.9752, - "step": 3655 - }, - { - "epoch": 12.502134927412468, - "grad_norm": 0.279296875, - "learning_rate": 1.601594334489702e-05, - "loss": 0.9632, - "step": 3660 - }, - { - "epoch": 12.519214346712213, - "grad_norm": 0.28125, - "learning_rate": 1.5800304160394673e-05, - "loss": 0.9749, - "step": 3665 - }, - { - "epoch": 12.536293766011955, - "grad_norm": 0.279296875, - "learning_rate": 1.5586001929883865e-05, - "loss": 0.9746, - "step": 3670 - }, - { - "epoch": 12.553373185311699, - "grad_norm": 0.287109375, - "learning_rate": 1.5373040056134814e-05, - "loss": 0.9783, - "step": 3675 - }, - { - "epoch": 12.570452604611443, - "grad_norm": 0.283203125, - "learning_rate": 1.516142192063521e-05, - "loss": 0.974, - "step": 3680 - }, - { - "epoch": 12.587532023911187, - "grad_norm": 0.2890625, - "learning_rate": 1.4951150883536225e-05, - "loss": 0.9802, - "step": 3685 - }, - { - "epoch": 12.604611443210931, - "grad_norm": 0.279296875, - "learning_rate": 1.474223028359939e-05, - "loss": 0.9729, - "step": 3690 - }, - { - "epoch": 12.621690862510675, - "grad_norm": 0.28125, - "learning_rate": 1.4534663438143415e-05, - "loss": 0.9711, - "step": 3695 - }, - { - "epoch": 12.638770281810418, - "grad_norm": 0.29296875, - "learning_rate": 1.4328453642991646e-05, - "loss": 0.969, - "step": 3700 - }, - { - "epoch": 12.655849701110162, - "grad_norm": 0.30078125, - "learning_rate": 1.4123604172419713e-05, - "loss": 0.9795, - "step": 3705 - }, - { - "epoch": 12.672929120409906, - "grad_norm": 0.271484375, - "learning_rate": 1.392011827910341e-05, - "loss": 0.9715, - "step": 3710 - }, - { - "epoch": 12.69000853970965, - "grad_norm": 0.296875, - "learning_rate": 1.3717999194067232e-05, - "loss": 0.9644, - "step": 3715 - }, - { - "epoch": 12.707087959009394, - "grad_norm": 0.28125, - "learning_rate": 1.3517250126632986e-05, - "loss": 0.9717, - "step": 3720 - }, - { - "epoch": 12.724167378309138, - "grad_norm": 0.28125, - "learning_rate": 1.3317874264368734e-05, - "loss": 0.9716, - "step": 3725 - }, - { - "epoch": 12.74124679760888, - "grad_norm": 0.27734375, - "learning_rate": 1.311987477303842e-05, - "loss": 0.9772, - "step": 3730 - }, - { - "epoch": 12.758326216908625, - "grad_norm": 0.275390625, - "learning_rate": 1.292325479655131e-05, - "loss": 0.9656, - "step": 3735 - }, - { - "epoch": 12.775405636208369, - "grad_norm": 0.30078125, - "learning_rate": 1.2728017456912344e-05, - "loss": 0.9738, - "step": 3740 - }, - { - "epoch": 12.792485055508113, - "grad_norm": 0.28515625, - "learning_rate": 1.2534165854172397e-05, - "loss": 0.9709, - "step": 3745 - }, - { - "epoch": 12.809564474807857, - "grad_norm": 0.275390625, - "learning_rate": 1.2341703066379074e-05, - "loss": 0.9714, - "step": 3750 - }, - { - "epoch": 12.826643894107601, - "grad_norm": 0.298828125, - "learning_rate": 1.2150632149527886e-05, - "loss": 0.9722, - "step": 3755 - }, - { - "epoch": 12.843723313407343, - "grad_norm": 0.27734375, - "learning_rate": 1.1960956137513701e-05, - "loss": 0.978, - "step": 3760 - }, - { - "epoch": 12.860802732707088, - "grad_norm": 0.279296875, - "learning_rate": 1.1772678042082607e-05, - "loss": 0.9668, - "step": 3765 - }, - { - "epoch": 12.877882152006832, - "grad_norm": 0.283203125, - "learning_rate": 1.158580085278398e-05, - "loss": 0.9748, - "step": 3770 - }, - { - "epoch": 12.894961571306576, - "grad_norm": 0.265625, - "learning_rate": 1.1400327536923083e-05, - "loss": 0.9702, - "step": 3775 - }, - { - "epoch": 12.91204099060632, - "grad_norm": 0.318359375, - "learning_rate": 1.1216261039514087e-05, - "loss": 0.97, - "step": 3780 - }, - { - "epoch": 12.929120409906064, - "grad_norm": 0.283203125, - "learning_rate": 1.1033604283233035e-05, - "loss": 0.968, - "step": 3785 - }, - { - "epoch": 12.946199829205806, - "grad_norm": 0.2890625, - "learning_rate": 1.0852360168371656e-05, - "loss": 0.9732, - "step": 3790 - }, - { - "epoch": 12.96327924850555, - "grad_norm": 0.28515625, - "learning_rate": 1.0672531572791178e-05, - "loss": 0.9739, - "step": 3795 - }, - { - "epoch": 12.980358667805294, - "grad_norm": 0.2734375, - "learning_rate": 1.049412135187675e-05, - "loss": 0.9687, - "step": 3800 - }, - { - "epoch": 12.997438087105039, - "grad_norm": 0.28125, - "learning_rate": 1.0317132338492019e-05, - "loss": 0.9694, - "step": 3805 - }, - { - "epoch": 12.997438087105039, - "eval_loss": 2.4820432662963867, - "eval_runtime": 0.553, - "eval_samples_per_second": 18.083, - "eval_steps_per_second": 1.808, - "step": 3805 - }, - { - "epoch": 13.014517506404783, - "grad_norm": 0.26953125, - "learning_rate": 1.0141567342934132e-05, - "loss": 0.9741, - "step": 3810 - }, - { - "epoch": 13.031596925704527, - "grad_norm": 0.283203125, - "learning_rate": 9.967429152889208e-06, - "loss": 0.9639, - "step": 3815 - }, - { - "epoch": 13.048676345004269, - "grad_norm": 0.30078125, - "learning_rate": 9.794720533388024e-06, - "loss": 0.9852, - "step": 3820 - }, - { - "epoch": 13.065755764304013, - "grad_norm": 0.27734375, - "learning_rate": 9.623444226762035e-06, - "loss": 0.9699, - "step": 3825 - }, - { - "epoch": 13.082835183603757, - "grad_norm": 0.27734375, - "learning_rate": 9.453602952599982e-06, - "loss": 0.9677, - "step": 3830 - }, - { - "epoch": 13.099914602903501, - "grad_norm": 0.28125, - "learning_rate": 9.285199407704558e-06, - "loss": 0.9795, - "step": 3835 - }, - { - "epoch": 13.116994022203246, - "grad_norm": 0.267578125, - "learning_rate": 9.118236266049707e-06, - "loss": 0.9746, - "step": 3840 - }, - { - "epoch": 13.13407344150299, - "grad_norm": 0.283203125, - "learning_rate": 8.95271617873813e-06, - "loss": 0.9759, - "step": 3845 - }, - { - "epoch": 13.151152860802732, - "grad_norm": 0.2734375, - "learning_rate": 8.788641773959105e-06, - "loss": 0.966, - "step": 3850 - }, - { - "epoch": 13.168232280102476, - "grad_norm": 0.2890625, - "learning_rate": 8.626015656946895e-06, - "loss": 0.9667, - "step": 3855 - }, - { - "epoch": 13.18531169940222, - "grad_norm": 0.2734375, - "learning_rate": 8.464840409939267e-06, - "loss": 0.9725, - "step": 3860 - }, - { - "epoch": 13.202391118701964, - "grad_norm": 0.279296875, - "learning_rate": 8.305118592136597e-06, - "loss": 0.9682, - "step": 3865 - }, - { - "epoch": 13.219470538001708, - "grad_norm": 0.27734375, - "learning_rate": 8.146852739661105e-06, - "loss": 0.9727, - "step": 3870 - }, - { - "epoch": 13.236549957301452, - "grad_norm": 0.26953125, - "learning_rate": 7.99004536551664e-06, - "loss": 0.9664, - "step": 3875 - }, - { - "epoch": 13.253629376601195, - "grad_norm": 0.26953125, - "learning_rate": 7.834698959548914e-06, - "loss": 0.9708, - "step": 3880 - }, - { - "epoch": 13.270708795900939, - "grad_norm": 0.267578125, - "learning_rate": 7.6808159884057e-06, - "loss": 0.9755, - "step": 3885 - }, - { - "epoch": 13.287788215200683, - "grad_norm": 0.2890625, - "learning_rate": 7.528398895497924e-06, - "loss": 0.9687, - "step": 3890 - }, - { - "epoch": 13.304867634500427, - "grad_norm": 0.271484375, - "learning_rate": 7.377450100960648e-06, - "loss": 0.9743, - "step": 3895 - }, - { - "epoch": 13.321947053800171, - "grad_norm": 0.29296875, - "learning_rate": 7.2279720016148244e-06, - "loss": 0.9716, - "step": 3900 - }, - { - "epoch": 13.339026473099915, - "grad_norm": 0.296875, - "learning_rate": 7.079966970929175e-06, - "loss": 0.9706, - "step": 3905 - }, - { - "epoch": 13.356105892399658, - "grad_norm": 0.275390625, - "learning_rate": 6.933437358982409e-06, - "loss": 0.9724, - "step": 3910 - }, - { - "epoch": 13.373185311699402, - "grad_norm": 0.283203125, - "learning_rate": 6.788385492426053e-06, - "loss": 0.9797, - "step": 3915 - }, - { - "epoch": 13.390264730999146, - "grad_norm": 0.322265625, - "learning_rate": 6.6448136744474474e-06, - "loss": 0.9764, - "step": 3920 - }, - { - "epoch": 13.40734415029889, - "grad_norm": 0.29296875, - "learning_rate": 6.502724184733122e-06, - "loss": 0.9751, - "step": 3925 - }, - { - "epoch": 13.424423569598634, - "grad_norm": 0.279296875, - "learning_rate": 6.36211927943271e-06, - "loss": 0.9724, - "step": 3930 - }, - { - "epoch": 13.441502988898378, - "grad_norm": 0.275390625, - "learning_rate": 6.223001191123012e-06, - "loss": 0.9804, - "step": 3935 - }, - { - "epoch": 13.45858240819812, - "grad_norm": 0.279296875, - "learning_rate": 6.085372128772637e-06, - "loss": 0.9742, - "step": 3940 - }, - { - "epoch": 13.475661827497865, - "grad_norm": 0.279296875, - "learning_rate": 5.949234277706861e-06, - "loss": 0.9763, - "step": 3945 - }, - { - "epoch": 13.492741246797609, - "grad_norm": 0.283203125, - "learning_rate": 5.814589799572956e-06, - "loss": 0.9779, - "step": 3950 - }, - { - "epoch": 13.509820666097353, - "grad_norm": 0.283203125, - "learning_rate": 5.681440832305873e-06, - "loss": 0.9769, - "step": 3955 - }, - { - "epoch": 13.526900085397097, - "grad_norm": 0.287109375, - "learning_rate": 5.549789490094304e-06, - "loss": 0.9766, - "step": 3960 - }, - { - "epoch": 13.543979504696841, - "grad_norm": 0.283203125, - "learning_rate": 5.41963786334706e-06, - "loss": 0.9655, - "step": 3965 - }, - { - "epoch": 13.561058923996583, - "grad_norm": 0.28125, - "learning_rate": 5.290988018659937e-06, - "loss": 0.9738, - "step": 3970 - }, - { - "epoch": 13.578138343296327, - "grad_norm": 0.333984375, - "learning_rate": 5.163841998782837e-06, - "loss": 0.9729, - "step": 3975 - }, - { - "epoch": 13.595217762596072, - "grad_norm": 0.298828125, - "learning_rate": 5.038201822587474e-06, - "loss": 0.9685, - "step": 3980 - }, - { - "epoch": 13.612297181895816, - "grad_norm": 0.271484375, - "learning_rate": 4.914069485035111e-06, - "loss": 0.9698, - "step": 3985 - }, - { - "epoch": 13.62937660119556, - "grad_norm": 0.265625, - "learning_rate": 4.79144695714504e-06, - "loss": 0.9747, - "step": 3990 - }, - { - "epoch": 13.646456020495304, - "grad_norm": 0.2734375, - "learning_rate": 4.67033618596322e-06, - "loss": 0.9716, - "step": 3995 - }, - { - "epoch": 13.663535439795046, - "grad_norm": 0.2734375, - "learning_rate": 4.550739094531386e-06, - "loss": 0.9771, - "step": 4000 - }, - { - "epoch": 13.68061485909479, - "grad_norm": 0.275390625, - "learning_rate": 4.432657581856525e-06, - "loss": 0.9703, - "step": 4005 - }, - { - "epoch": 13.697694278394534, - "grad_norm": 0.30078125, - "learning_rate": 4.316093522880648e-06, - "loss": 0.9621, - "step": 4010 - }, - { - "epoch": 13.714773697694278, - "grad_norm": 0.28515625, - "learning_rate": 4.20104876845111e-06, - "loss": 0.977, - "step": 4015 - }, - { - "epoch": 13.731853116994023, - "grad_norm": 0.275390625, - "learning_rate": 4.087525145291204e-06, - "loss": 0.9682, - "step": 4020 - }, - { - "epoch": 13.748932536293767, - "grad_norm": 0.2890625, - "learning_rate": 3.97552445597108e-06, - "loss": 0.9693, - "step": 4025 - }, - { - "epoch": 13.766011955593509, - "grad_norm": 0.28515625, - "learning_rate": 3.865048478879241e-06, - "loss": 0.9684, - "step": 4030 - }, - { - "epoch": 13.783091374893253, - "grad_norm": 0.287109375, - "learning_rate": 3.7560989681941992e-06, - "loss": 0.9724, - "step": 4035 - }, - { - "epoch": 13.800170794192997, - "grad_norm": 0.26953125, - "learning_rate": 3.6486776538566803e-06, - "loss": 0.977, - "step": 4040 - }, - { - "epoch": 13.817250213492741, - "grad_norm": 0.271484375, - "learning_rate": 3.542786241542162e-06, - "loss": 0.9676, - "step": 4045 - }, - { - "epoch": 13.834329632792485, - "grad_norm": 0.27734375, - "learning_rate": 3.4384264126337328e-06, - "loss": 0.967, - "step": 4050 - }, - { - "epoch": 13.85140905209223, - "grad_norm": 0.2734375, - "learning_rate": 3.3355998241954678e-06, - "loss": 0.971, - "step": 4055 - }, - { - "epoch": 13.868488471391974, - "grad_norm": 0.27734375, - "learning_rate": 3.2343081089460603e-06, - "loss": 0.9702, - "step": 4060 - }, - { - "epoch": 13.885567890691716, - "grad_norm": 0.275390625, - "learning_rate": 3.1345528752329212e-06, - "loss": 0.9733, - "step": 4065 - }, - { - "epoch": 13.90264730999146, - "grad_norm": 0.28125, - "learning_rate": 3.0363357070066544e-06, - "loss": 0.9707, - "step": 4070 - }, - { - "epoch": 13.919726729291204, - "grad_norm": 0.287109375, - "learning_rate": 2.939658163795844e-06, - "loss": 0.9696, - "step": 4075 - }, - { - "epoch": 13.936806148590948, - "grad_norm": 0.275390625, - "learning_rate": 2.8445217806824077e-06, - "loss": 0.9683, - "step": 4080 - }, - { - "epoch": 13.953885567890692, - "grad_norm": 0.27734375, - "learning_rate": 2.750928068277081e-06, - "loss": 0.9703, - "step": 4085 - }, - { - "epoch": 13.970964987190435, - "grad_norm": 0.287109375, - "learning_rate": 2.658878512695562e-06, - "loss": 0.9775, - "step": 4090 - }, - { - "epoch": 13.988044406490179, - "grad_norm": 0.28125, - "learning_rate": 2.5683745755348044e-06, - "loss": 0.968, - "step": 4095 - }, - { - "epoch": 13.998292058070026, - "eval_loss": 2.482388734817505, - "eval_runtime": 0.5602, - "eval_samples_per_second": 17.85, - "eval_steps_per_second": 1.785, - "step": 4098 - }, - { - "epoch": 14.005123825789923, - "grad_norm": 0.28125, - "learning_rate": 2.4794176938498837e-06, - "loss": 0.9692, - "step": 4100 - }, - { - "epoch": 14.022203245089667, - "grad_norm": 0.27734375, - "learning_rate": 2.392009280131169e-06, - "loss": 0.9693, - "step": 4105 - }, - { - "epoch": 14.039282664389411, - "grad_norm": 0.275390625, - "learning_rate": 2.30615072228183e-06, - "loss": 0.9759, - "step": 4110 - }, - { - "epoch": 14.056362083689155, - "grad_norm": 0.279296875, - "learning_rate": 2.221843383595923e-06, - "loss": 0.9748, - "step": 4115 - }, - { - "epoch": 14.073441502988898, - "grad_norm": 0.291015625, - "learning_rate": 2.139088602736616e-06, - "loss": 0.9762, - "step": 4120 - }, - { - "epoch": 14.090520922288642, - "grad_norm": 0.275390625, - "learning_rate": 2.057887693714988e-06, - "loss": 0.9779, - "step": 4125 - }, - { - "epoch": 14.107600341588386, - "grad_norm": 0.271484375, - "learning_rate": 1.9782419458692193e-06, - "loss": 0.9726, - "step": 4130 - }, - { - "epoch": 14.12467976088813, - "grad_norm": 0.267578125, - "learning_rate": 1.900152623843987e-06, - "loss": 0.964, - "step": 4135 - }, - { - "epoch": 14.141759180187874, - "grad_norm": 0.2734375, - "learning_rate": 1.8236209675705274e-06, - "loss": 0.9724, - "step": 4140 - }, - { - "epoch": 14.158838599487618, - "grad_norm": 0.263671875, - "learning_rate": 1.7486481922468489e-06, - "loss": 0.9744, - "step": 4145 - }, - { - "epoch": 14.17591801878736, - "grad_norm": 0.27734375, - "learning_rate": 1.6752354883184717e-06, - "loss": 0.9797, - "step": 4150 - }, - { - "epoch": 14.192997438087104, - "grad_norm": 0.271484375, - "learning_rate": 1.6033840214595308e-06, - "loss": 0.976, - "step": 4155 - }, - { - "epoch": 14.210076857386849, - "grad_norm": 0.283203125, - "learning_rate": 1.5330949325542797e-06, - "loss": 0.9653, - "step": 4160 - }, - { - "epoch": 14.227156276686593, - "grad_norm": 0.271484375, - "learning_rate": 1.4643693376789058e-06, - "loss": 0.9744, - "step": 4165 - }, - { - "epoch": 14.244235695986337, - "grad_norm": 0.29296875, - "learning_rate": 1.397208328083921e-06, - "loss": 0.9619, - "step": 4170 - }, - { - "epoch": 14.26131511528608, - "grad_norm": 0.279296875, - "learning_rate": 1.3316129701766878e-06, - "loss": 0.9673, - "step": 4175 - }, - { - "epoch": 14.278394534585825, - "grad_norm": 0.2734375, - "learning_rate": 1.2675843055046765e-06, - "loss": 0.9703, - "step": 4180 - }, - { - "epoch": 14.295473953885567, - "grad_norm": 0.27734375, - "learning_rate": 1.205123350738746e-06, - "loss": 0.9778, - "step": 4185 - }, - { - "epoch": 14.312553373185311, - "grad_norm": 0.279296875, - "learning_rate": 1.144231097657078e-06, - "loss": 0.9707, - "step": 4190 - }, - { - "epoch": 14.329632792485056, - "grad_norm": 0.28125, - "learning_rate": 1.0849085131294678e-06, - "loss": 0.966, - "step": 4195 - }, - { - "epoch": 14.3467122117848, - "grad_norm": 0.26953125, - "learning_rate": 1.0271565391018922e-06, - "loss": 0.9732, - "step": 4200 - }, - { - "epoch": 14.363791631084544, - "grad_norm": 0.296875, - "learning_rate": 9.709760925816325e-07, - "loss": 0.9701, - "step": 4205 - }, - { - "epoch": 14.380871050384286, - "grad_norm": 0.283203125, - "learning_rate": 9.163680656226303e-07, - "loss": 0.9754, - "step": 4210 - }, - { - "epoch": 14.39795046968403, - "grad_norm": 0.279296875, - "learning_rate": 8.633333253113995e-07, - "loss": 0.9676, - "step": 4215 - }, - { - "epoch": 14.415029888983774, - "grad_norm": 0.283203125, - "learning_rate": 8.118727137532034e-07, - "loss": 0.9781, - "step": 4220 - }, - { - "epoch": 14.432109308283518, - "grad_norm": 0.28125, - "learning_rate": 7.619870480587099e-07, - "loss": 0.9664, - "step": 4225 - }, - { - "epoch": 14.449188727583262, - "grad_norm": 0.275390625, - "learning_rate": 7.136771203310245e-07, - "loss": 0.9772, - "step": 4230 - }, - { - "epoch": 14.466268146883007, - "grad_norm": 0.2734375, - "learning_rate": 6.669436976530885e-07, - "loss": 0.9685, - "step": 4235 - }, - { - "epoch": 14.48334756618275, - "grad_norm": 0.275390625, - "learning_rate": 6.21787522075512e-07, - "loss": 0.9722, - "step": 4240 - }, - { - "epoch": 14.500426985482493, - "grad_norm": 0.2734375, - "learning_rate": 5.782093106048159e-07, - "loss": 0.969, - "step": 4245 - }, - { - "epoch": 14.517506404782237, - "grad_norm": 0.279296875, - "learning_rate": 5.362097551919631e-07, - "loss": 0.9691, - "step": 4250 - }, - { - "epoch": 14.534585824081981, - "grad_norm": 0.275390625, - "learning_rate": 4.957895227215015e-07, - "loss": 0.9683, - "step": 4255 - }, - { - "epoch": 14.551665243381725, - "grad_norm": 0.27734375, - "learning_rate": 4.569492550008603e-07, - "loss": 0.9676, - "step": 4260 - }, - { - "epoch": 14.56874466268147, - "grad_norm": 0.279296875, - "learning_rate": 4.1968956875020336e-07, - "loss": 0.9796, - "step": 4265 - }, - { - "epoch": 14.585824081981212, - "grad_norm": 0.275390625, - "learning_rate": 3.84011055592659e-07, - "loss": 0.9671, - "step": 4270 - }, - { - "epoch": 14.602903501280956, - "grad_norm": 0.275390625, - "learning_rate": 3.49914282044872e-07, - "loss": 0.9776, - "step": 4275 - }, - { - "epoch": 14.6199829205807, - "grad_norm": 0.27734375, - "learning_rate": 3.1739978950806603e-07, - "loss": 0.9661, - "step": 4280 - }, - { - "epoch": 14.637062339880444, - "grad_norm": 0.28125, - "learning_rate": 2.864680942594178e-07, - "loss": 0.9708, - "step": 4285 - }, - { - "epoch": 14.654141759180188, - "grad_norm": 0.2890625, - "learning_rate": 2.5711968744382974e-07, - "loss": 0.9728, - "step": 4290 - }, - { - "epoch": 14.671221178479932, - "grad_norm": 0.271484375, - "learning_rate": 2.2935503506621436e-07, - "loss": 0.9742, - "step": 4295 - }, - { - "epoch": 14.688300597779676, - "grad_norm": 0.291015625, - "learning_rate": 2.0317457798398888e-07, - "loss": 0.9753, - "step": 4300 - }, - { - "epoch": 14.705380017079419, - "grad_norm": 0.30859375, - "learning_rate": 1.7857873190019192e-07, - "loss": 0.9707, - "step": 4305 - }, - { - "epoch": 14.722459436379163, - "grad_norm": 0.27734375, - "learning_rate": 1.5556788735676675e-07, - "loss": 0.971, - "step": 4310 - }, - { - "epoch": 14.739538855678907, - "grad_norm": 0.271484375, - "learning_rate": 1.3414240972843273e-07, - "loss": 0.9695, - "step": 4315 - }, - { - "epoch": 14.756618274978651, - "grad_norm": 0.271484375, - "learning_rate": 1.143026392168789e-07, - "loss": 0.9833, - "step": 4320 - }, - { - "epoch": 14.773697694278395, - "grad_norm": 0.27734375, - "learning_rate": 9.604889084532387e-08, - "loss": 0.9728, - "step": 4325 - }, - { - "epoch": 14.790777113578137, - "grad_norm": 0.30078125, - "learning_rate": 7.938145445357536e-08, - "loss": 0.9739, - "step": 4330 - }, - { - "epoch": 14.807856532877882, - "grad_norm": 0.279296875, - "learning_rate": 6.430059469334504e-08, - "loss": 0.974, - "step": 4335 - }, - { - "epoch": 14.824935952177626, - "grad_norm": 0.28125, - "learning_rate": 5.0806551024129565e-08, - "loss": 0.9763, - "step": 4340 - }, - { - "epoch": 14.84201537147737, - "grad_norm": 0.28125, - "learning_rate": 3.889953770935817e-08, - "loss": 0.978, - "step": 4345 - }, - { - "epoch": 14.859094790777114, - "grad_norm": 0.287109375, - "learning_rate": 2.8579743813006432e-08, - "loss": 0.9686, - "step": 4350 - }, - { - "epoch": 14.876174210076858, - "grad_norm": 0.27734375, - "learning_rate": 1.98473331965654e-08, - "loss": 0.9723, - "step": 4355 - }, - { - "epoch": 14.893253629376602, - "grad_norm": 0.287109375, - "learning_rate": 1.270244451652136e-08, - "loss": 0.9641, - "step": 4360 - }, - { - "epoch": 14.910333048676344, - "grad_norm": 0.28125, - "learning_rate": 7.145191222035497e-09, - "loss": 0.968, - "step": 4365 - }, - { - "epoch": 14.927412467976088, - "grad_norm": 0.2734375, - "learning_rate": 3.175661553256326e-09, - "loss": 0.9763, - "step": 4370 - }, - { - "epoch": 14.944491887275833, - "grad_norm": 0.283203125, - "learning_rate": 7.939185398431193e-10, - "loss": 0.972, - "step": 4375 - }, - { - "epoch": 14.961571306575577, - "grad_norm": 0.275390625, - "learning_rate": 0.0, - "loss": 0.9728, - "step": 4380 - }, - { - "epoch": 14.961571306575577, - "eval_loss": 2.482285976409912, - "eval_runtime": 0.5491, - "eval_samples_per_second": 18.212, - "eval_steps_per_second": 1.821, - "step": 4380 - }, - { - "epoch": 14.961571306575577, - "step": 4380, - "total_flos": 5.145390446595277e+18, - "train_loss": 1.0581742508226333, - "train_runtime": 45587.0719, - "train_samples_per_second": 9.241, - "train_steps_per_second": 0.096 + "epoch": 9.974380871050384, + "step": 2920, + "total_flos": 3.4809256003093135e+18, + "train_loss": 0.9919237802289936, + "train_runtime": 34991.5416, + "train_samples_per_second": 8.027, + "train_steps_per_second": 0.083 } ], "logging_steps": 5, - "max_steps": 4380, + "max_steps": 2920, "num_input_tokens_seen": 0, - "num_train_epochs": 15, + "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { @@ -6294,7 +4210,7 @@ "attributes": {} } }, - "total_flos": 5.145390446595277e+18, + "total_flos": 3.4809256003093135e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null